aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorUros Bizjak <uros@gcc.gnu.org>2016-05-04 23:13:13 +0200
committerUros Bizjak <uros@gcc.gnu.org>2016-05-04 23:13:13 +0200
commitfdab73dc76d4551c652a3f3d686e765e637c95d9 (patch)
tree41ea71f3e9ca26ed18e279f6cc0d0911c7ea9b98 /gcc
parentd07d21777f637293ebf91bff6377f2621a1e7a0c (diff)
downloadgcc-fdab73dc76d4551c652a3f3d686e765e637c95d9.zip
gcc-fdab73dc76d4551c652a3f3d686e765e637c95d9.tar.gz
gcc-fdab73dc76d4551c652a3f3d686e765e637c95d9.tar.bz2
re PR target/70873 ([7 Regression] 20% performance regression at 482.sphinx3 after r235442 with -O2 -m32 on Haswell.)
PR target/70873 * config/i386/i386.md (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_extend sf->df peephole2): Change to post-epilogue_completed late splitter. Use sse_reg_operand as operand 0 predicate. (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_truncate df->sf peephole2): Ditto. (TARGET_SSE_PARTIAL_REG_DEPENDENCY float {si,di}->{sf,df} peephole2): Ditto. Emit the pattern using RTX. (TARGET_USE_VECTOR_FP_CONVERTS float_extend sf->df splitter): Use sse_reg_operand as operand 0 predicate. Do not use true_regnum in the post-reload splitter. Use lowpart_subreg instead of gen_rtx_REG. (TARGET_USE_VECTOR_FP_CONVERTS float_truncate df->sf splitter): Ditto. (TARGET_USE_VECTOR_CONVERTS float si->{sf,df} splitter): Use sse_reg_operand as operand 0 predicate. (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_extend sf->df peephole2): Use sse_reg_operand as operand 0 predicate. Use lowpart_subreg instead of gen_rtx_REG. (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_truncate df->sf peephole2): Ditto. From-SVN: r235906
Diffstat (limited to 'gcc')
-rw-r--r--gcc/ChangeLog32
-rw-r--r--gcc/config/i386/i386.md101
2 files changed, 77 insertions, 56 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 45e9087..772dd37 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,29 @@
+2016-05-04 Uros Bizjak <ubizjak@gmail.com>
+
+ PR target/70873
+ * config/i386/i386.md
+ (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_extend sf->df peephole2):
+ Change to post-epilogue_completed late splitter. Use sse_reg_operand
+ as operand 0 predicate.
+ (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_truncate df->sf peephole2):
+ Ditto.
+ (TARGET_SSE_PARTIAL_REG_DEPENDENCY float {si,di}->{sf,df} peephole2):
+ Ditto. Emit the pattern using RTX.
+
+ (TARGET_USE_VECTOR_FP_CONVERTS float_extend sf->df splitter):
+ Use sse_reg_operand as operand 0 predicate. Do not use true_regnum in
+ the post-reload splitter. Use lowpart_subreg instead of gen_rtx_REG.
+ (TARGET_USE_VECTOR_FP_CONVERTS float_truncate df->sf splitter):
+ Ditto.
+ (TARGET_USE_VECTOR_CONVERTS float si->{sf,df} splitter): Use
+ sse_reg_operand as operand 0 predicate.
+
+ (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_extend sf->df peephole2):
+ Use sse_reg_operand as operand 0 predicate. Use lowpart_subreg
+ instead of gen_rtx_REG.
+ (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_truncate df->sf peephole2):
+ Ditto.
+
2016-05-04 Segher Boessenkool <segher@kernel.crashing.org>
* function.c (emit_use_return_register_into_block): Delete.
@@ -94,8 +120,7 @@
* match.pd: Add BIT_FIELD_REF canonicalizations and vector
constructor simplifications.
- * fold-const.c (fold_ternary_loc): Remove duplicate functionality
- here.
+ * fold-const.c (fold_ternary_loc): Remove duplicate functionality here.
2016-05-04 Oleg Endo <olegendo@gcc.gnu.org>
@@ -219,8 +244,7 @@
2016-05-03 Jakub Jelinek <jakub@redhat.com>
* config/i386/i386.md (*truncdfsf_mixed, *truncdfsf_i387,
- *truncxfsf2_mixed, *truncxfdf2_mixed): Use v constraint instead
- of x.
+ *truncxfsf2_mixed, *truncxfdf2_mixed): Use v constraint instead of x.
2016-05-03 Richard Biener <rguenther@suse.de>
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index ba1ff8b..dd56b05 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4231,12 +4231,12 @@
that might lead to ICE on 32bit target. The sequence unlikely combine
anyway. */
(define_split
- [(set (match_operand:DF 0 "register_operand")
+ [(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "nonimmediate_operand")))]
"TARGET_USE_VECTOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
- && reload_completed && SSE_REG_P (operands[0])
+ && reload_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
[(set (match_dup 2)
@@ -4253,13 +4253,11 @@
{
/* If it is unsafe to overwrite upper half of source, we need
to move to destination and unpack there. */
- if (((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
- || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
- && true_regnum (operands[0]) != true_regnum (operands[1]))
+ if (REGNO (operands[0]) != REGNO (operands[1])
|| (EXT_REX_SSE_REG_P (operands[1])
&& !TARGET_AVX512VL))
{
- rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+ rtx tmp = lowpart_subreg (SFmode, operands[0], DFmode);
emit_move_insn (tmp, operands[1]);
}
else
@@ -4267,7 +4265,7 @@
/* FIXME: vec_interleave_lowv4sf for AVX512VL should allow
=v, v, then vbroadcastss will be only needed for AVX512F without
AVX512VL. */
- if (!EXT_REX_SSE_REGNO_P (true_regnum (operands[3])))
+ if (!EXT_REX_SSE_REGNO_P (REGNO (operands[3])))
emit_insn (gen_vec_interleave_lowv4sf (operands[3], operands[3],
operands[3]));
else
@@ -4283,15 +4281,14 @@
;; It's more profitable to split and then extend in the same register.
(define_peephole2
- [(set (match_operand:DF 0 "register_operand")
+ [(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "memory_operand")))]
"TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
- && optimize_insn_for_speed_p ()
- && SSE_REG_P (operands[0])"
+ && optimize_insn_for_speed_p ()"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (float_extend:DF (match_dup 2)))]
- "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
+ "operands[2] = lowpart_subreg (SFmode, operands[0], DFmode);")
(define_insn "*extendsfdf2"
[(set (match_operand:DF 0 "nonimm_ssenomem_operand" "=f,m,v")
@@ -4390,12 +4387,12 @@
that might lead to ICE on 32bit target. The sequence unlikely combine
anyway. */
(define_split
- [(set (match_operand:SF 0 "register_operand")
+ [(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "nonimmediate_operand")))]
"TARGET_USE_VECTOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
- && reload_completed && SSE_REG_P (operands[0])
+ && reload_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
[(set (match_dup 2)
@@ -4413,9 +4410,7 @@
if (REG_P (operands[1]))
{
if (!TARGET_SSE3
- && true_regnum (operands[0]) != true_regnum (operands[1])
- && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
- || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+ && REGNO (operands[0]) != REGNO (operands[1]))
{
rtx tmp = lowpart_subreg (DFmode, operands[0], SFmode);
emit_move_insn (tmp, operands[1]);
@@ -4432,15 +4427,14 @@
;; It's more profitable to split and then extend in the same register.
(define_peephole2
- [(set (match_operand:SF 0 "register_operand")
+ [(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "memory_operand")))]
"TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
- && optimize_insn_for_speed_p ()
- && SSE_REG_P (operands[0])"
+ && optimize_insn_for_speed_p ()"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (float_truncate:SF (match_dup 2)))]
- "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
+ "operands[2] = lowpart_subreg (DFmode, operands[0], SFmode);")
(define_expand "truncdfsf2_with_temp"
[(parallel [(set (match_operand:SF 0)
@@ -4547,7 +4541,7 @@
"reload_completed"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (match_dup 2))]
- "operands[1] = gen_rtx_REG (SFmode, true_regnum (operands[1]));")
+ "operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));")
;; Conversion from XFmode to {SF,DF}mode
@@ -5153,11 +5147,11 @@
;; slots when !TARGET_INTER_UNIT_MOVES_TO_VEC disables the general_regs
;; alternative in sse2_loadld.
(define_split
- [(set (match_operand:MODEF 0 "register_operand")
+ [(set (match_operand:MODEF 0 "sse_reg_operand")
(float:MODEF (match_operand:SI 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
- && reload_completed && SSE_REG_P (operands[0])
+ "TARGET_USE_VECTOR_CONVERTS
+ && optimize_function_for_speed_p (cfun)
+ && reload_completed
&& (MEM_P (operands[1]) || TARGET_INTER_UNIT_MOVES_TO_VEC)
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
@@ -5176,41 +5170,43 @@
DONE;
})
-;; Avoid partial SSE register dependency stalls
+;; Avoid partial SSE register dependency stalls. This splitter should split
+;; late in the pass sequence (after register rename pass), so allocated
+;; registers won't change anymore.
+
(define_split
- [(set (match_operand:MODEF 0 "register_operand")
+ [(set (match_operand:MODEF 0 "sse_reg_operand")
(float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ "TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
- && reload_completed && SSE_REG_P (operands[0])
+ && epilogue_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
- [(const_int 0)]
+ [(set (match_dup 0)
+ (vec_merge:<MODEF:ssevecmode>
+ (vec_duplicate:<MODEF:ssevecmode>
+ (float:MODEF
+ (match_dup 1)))
+ (match_dup 0)
+ (const_int 1)))]
{
const machine_mode vmode = <MODEF:ssevecmode>mode;
- const machine_mode mode = <MODEF:MODE>mode;
- rtx t, op0 = lowpart_subreg (vmode, operands[0], mode);
-
- emit_move_insn (op0, CONST0_RTX (vmode));
- t = gen_rtx_FLOAT (mode, operands[1]);
- t = gen_rtx_VEC_DUPLICATE (vmode, t);
- t = gen_rtx_VEC_MERGE (vmode, t, op0, const1_rtx);
- emit_insn (gen_rtx_SET (op0, t));
- DONE;
+ operands[0] = lowpart_subreg (vmode, operands[0], <MODEF:MODE>mode);
+ emit_move_insn (operands[0], CONST0_RTX (vmode));
})
-;; Break partial reg stall for cvtsd2ss.
+;; Break partial reg stall for cvtsd2ss. This splitter should split
+;; late in the pass sequence (after register rename pass),
+;; so allocated registers won't change anymore.
-(define_peephole2
- [(set (match_operand:SF 0 "register_operand")
+(define_split
+ [(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ "TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
- && SSE_REG_P (operands[0])
+ && epilogue_completed
&& (!SSE_REG_P (operands[1])
|| REGNO (operands[0]) != REGNO (operands[1]))
&& (!EXT_REX_SSE_REG_P (operands[0])
@@ -5228,16 +5224,17 @@
emit_move_insn (operands[0], CONST0_RTX (V4SFmode));
})
-;; Break partial reg stall for cvtss2sd.
+;; Break partial reg stall for cvtss2sd. This splitter should split
+;; late in the pass sequence (after register rename pass),
+;; so allocated registers won't change anymore.
-(define_peephole2
- [(set (match_operand:DF 0 "register_operand")
+(define_split
+ [(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ "TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
- && SSE_REG_P (operands[0])
+ && epilogue_completed
&& (!SSE_REG_P (operands[1])
|| REGNO (operands[0]) != REGNO (operands[1]))
&& (!EXT_REX_SSE_REG_P (operands[0])