-rw-r--r--  gcc/ChangeLog             17
-rw-r--r--  gcc/config/i386/i386.c     4
-rw-r--r--  gcc/config/i386/i386.h     2
-rw-r--r--  gcc/config/i386/i386.md  292
-rw-r--r--  gcc/config/i386/sse.md     2
5 files changed, 296 insertions, 21 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index ff54c46..d748be2 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,20 @@
+2007-09-09 Jan Hubicka <jh@suse.cz>
+ Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>
+
+ * i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
+ (TARGET_USE_VECTOR_CONVERTS): New.
+ * i386.md: New post-reload splitters for converting SF to DF and DF to
+ SF.
+ (floatsi* expander): Special case vector conversions.
+ (floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
+ floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
+ floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
+ (floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
+ Disable when doing vector converts.
+ (floatsi<mode>2_i387): Disable when SSE math handles the mode.
+ * sse.md (vec_dupv2df): Export.
+ * i386.c (ix86_tune_features): Enable vector converts for AMDFAM10.
+
2007-09-09 Richard Guenther <rguenther@suse.de>
* tree-ssa-operands.c (add_virtual_operand): Only mark
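
As a rough illustration of what the new tuning targets (a hypothetical test case, not part of the commit): plain int-to-FP conversions such as the two functions below. With -mtune=amdfam10, SSE math and optimization for speed, the backend now prefers the packed cvtdq2ps/cvtdq2pd forms over the scalar cvtsi2ss/cvtsi2sd instructions.

/* Hypothetical example, assuming -O2 -mtune=amdfam10 -mfpmath=sse.  */
double int_to_double (int i) { return i; }
float  int_to_float  (int i) { return i; }
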
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f6f80a0..c01198b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1258,6 +1258,10 @@ unsigned int ix86_tune_features[X86_TUNE_LAST] = {
operand that cannot be represented using a modRM byte. The XOR
replacement is long decoded, so this split helps here as well. */
m_K6,
+
+ /* X86_USE_VECTOR_CONVERTS: Prefer packed SSE vector conversions from
+ integer to FP. */
+ m_AMDFAM10,
};
/* Feature tests against the various architecture variations. */
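
A simplified, self-contained sketch of how such a tuning entry is consumed (the identifiers below other than m_AMDFAM10 are local to this sketch and only mirror the real ix86_tune_features machinery in spirit): each slot holds a bitmask of processor models the tuning applies to, and the slot is reduced against the mask of the CPU selected by -mtune before the corresponding TARGET_* macro reads it.

#include <stdio.h>

enum cpu  { CPU_K6, CPU_CORE2, CPU_AMDFAM10, CPU_LAST };
enum tune { TUNE_USE_VECTOR_CONVERTS, TUNE_LAST };

#define m_K6       (1u << CPU_K6)
#define m_AMDFAM10 (1u << CPU_AMDFAM10)

static unsigned tune_features[TUNE_LAST] = {
  /* TUNE_USE_VECTOR_CONVERTS: prefer packed SSE int->FP conversion.  */
  m_AMDFAM10,
};

int main (void)
{
  enum cpu selected = CPU_AMDFAM10;          /* as if -mtune=amdfam10 */
  unsigned tune_mask = 1u << selected;

  for (int i = 0; i < TUNE_LAST; i++)
    tune_features[i] &= tune_mask;           /* keep only the selected CPU */

  printf ("use vector converts: %s\n",
          tune_features[TUNE_USE_VECTOR_CONVERTS] ? "yes" : "no");
  return 0;
}
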
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 93e24dd..06e90f4 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -257,6 +257,7 @@ enum ix86_tune_indices {
X86_TUNE_MOVE_M1_VIA_OR,
X86_TUNE_NOT_UNPAIRABLE,
X86_TUNE_NOT_VECTORMODE,
+ X86_USE_VECTOR_CONVERTS,
X86_TUNE_LAST
};
@@ -337,6 +338,7 @@ extern unsigned int ix86_tune_features[X86_TUNE_LAST];
#define TARGET_MOVE_M1_VIA_OR ix86_tune_features[X86_TUNE_MOVE_M1_VIA_OR]
#define TARGET_NOT_UNPAIRABLE ix86_tune_features[X86_TUNE_NOT_UNPAIRABLE]
#define TARGET_NOT_VECTORMODE ix86_tune_features[X86_TUNE_NOT_VECTORMODE]
+#define TARGET_USE_VECTOR_CONVERTS ix86_tune_features[X86_USE_VECTOR_CONVERTS]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 42b3bab..352f67d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3916,6 +3916,49 @@
}
})
+/* For converting SF(xmm2) to DF(xmm1), use the following code instead of
+ cvtss2sd:
+ unpcklps xmm2,xmm2 ; otherwise the packed conversion might trap on signaling NaNs
+ cvtps2pd xmm2,xmm1
+ We do the conversion post reload to avoid producing 128-bit spills that
+ might lead to an ICE on 32-bit targets. The sequence is unlikely to be
+ combined anyway. */
+(define_split
+ [(set (match_operand:DF 0 "register_operand" "")
+ (float_extend:DF
+ (match_operand:SF 1 "nonimmediate_operand" "")))]
+ "TARGET_USE_VECTOR_CONVERTS && !optimize_size
+ && reload_completed && SSE_REG_P (operands[0])"
+ [(set (match_dup 2)
+ (float_extend:V2DF
+ (vec_select:V2SF
+ (match_dup 3)
+ (parallel [(const_int 0) (const_int 1)]))))]
+{
+ operands[2] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+ operands[3] = simplify_gen_subreg (V4SFmode, operands[0], DFmode, 0);
+ /* Use movss to load from memory and unpcklps reg, reg for registers.
+ Try to avoid a move when the unpack can be done in the source. */
+ if (REG_P (operands[1]))
+ {
+ /* If it is unsafe to overwrite the upper half of the source, move
+ to the destination first and unpack there. */
+ if ((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+ || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
+ && true_regnum (operands[0]) != true_regnum (operands[1]))
+ {
+ rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+ emit_move_insn (tmp, operands[1]);
+ }
+ else
+ operands[3] = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+ emit_insn (gen_sse_unpcklps (operands[3], operands[3], operands[3]));
+ }
+ else
+ emit_insn (gen_vec_setv4sf_0 (operands[3],
+ CONST0_RTX (V4SFmode), operands[1]));
+})
+
(define_insn "*extendsfdf2_mixed"
[(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
(float_extend:DF
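
The new SF-to-DF splitter in the hunk above ultimately produces the unpcklps + cvtps2pd pair described in its comment. In intrinsics terms the emitted sequence behaves roughly like the sketch below (illustrative only; the helper name is ours and the splitter itself operates on RTL, not intrinsics):

#include <emmintrin.h>

/* SF -> DF via packed conversion: duplicate the scalar into the low lanes
   so the packed convert never sees junk (e.g. a signaling NaN) in a lane it
   converts, then widen and keep lane 0.  */
static inline double sf_to_df_packed (__m128 x)   /* value in lane 0 */
{
  x = _mm_unpacklo_ps (x, x);          /* unpcklps %xmm,%xmm */
  __m128d wide = _mm_cvtps_pd (x);     /* cvtps2pd %xmm,%xmm */
  return _mm_cvtsd_f64 (wide);         /* result is lane 0 */
}
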
@@ -4009,6 +4052,51 @@
}
})
+/* For converting DF(xmm2) to SF(xmm1), use the following code instead of
+ cvtsd2ss:
+ unpcklpd xmm2,xmm2 ; otherwise the packed conversion might trap on signaling NaNs
+ cvtpd2ps xmm2,xmm1
+ We do the conversion post reload to avoid producing 128-bit spills that
+ might lead to an ICE on 32-bit targets. The sequence is unlikely to be
+ combined anyway. */
+(define_split
+ [(set (match_operand:SF 0 "register_operand" "")
+ (float_truncate:SF
+ (match_operand:DF 1 "nonimmediate_operand" "")))]
+ "TARGET_USE_VECTOR_CONVERTS && !optimize_size
+ && reload_completed && SSE_REG_P (operands[0])"
+ [(set (match_dup 2)
+ (vec_concat:V4SF
+ (float_truncate:V2SF
+ (match_dup 4))
+ (match_dup 3)))]
+{
+ operands[2] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+ operands[3] = CONST0_RTX (V2SFmode);
+ operands[4] = simplify_gen_subreg (V2DFmode, operands[0], SFmode, 0);
+ /* Use movsd to load from memory and unpcklpd for registers.
+ Try to avoid a move when the unpack can be done in the source, or when
+ SSE3 movddup is available. */
+ if (REG_P (operands[1]))
+ {
+ if (!TARGET_SSE3
+ && true_regnum (operands[0]) != true_regnum (operands[1])
+ && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+ || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+ {
+ rtx tmp = simplify_gen_subreg (DFmode, operands[0], SFmode, 0);
+ emit_move_insn (tmp, operands[1]);
+ operands[1] = tmp;
+ }
+ else if (!TARGET_SSE3)
+ operands[4] = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0);
+ emit_insn (gen_vec_dupv2df (operands[4], operands[1]));
+ }
+ else
+ emit_insn (gen_sse2_loadlpd (operands[4],
+ CONST0_RTX (V2DFmode), operands[1]));
+})
+
(define_expand "truncdfsf2_with_temp"
[(parallel [(set (match_operand:SF 0 "" "")
(float_truncate:SF (match_operand:DF 1 "" "")))
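
Likewise, the DF-to-SF splitter in the hunk above boils down to unpcklpd (or movddup on SSE3) followed by cvtpd2ps. A rough intrinsics equivalent (illustrative only, helper name is ours):

#include <emmintrin.h>

/* DF -> SF via packed conversion: duplicate the double into both lanes,
   narrow, take lane 0.  */
static inline float df_to_sf_packed (__m128d x)   /* value in lane 0 */
{
  x = _mm_unpacklo_pd (x, x);          /* unpcklpd %xmm,%xmm (movddup on SSE3) */
  __m128 narrow = _mm_cvtpd_ps (x);    /* cvtpd2ps %xmm,%xmm */
  return _mm_cvtss_f32 (narrow);       /* result is lane 0 */
}
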
@@ -4685,12 +4773,67 @@
[(set (match_operand:MODEF 0 "register_operand" "")
(float:MODEF (match_operand:SI 1 "nonimmediate_operand" "")))]
"TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
- "")
+ "
+ /* When we use vector converts, we can't have input in memory. */
+ if (GET_MODE (operands[0]) == DFmode && GET_MODE (operands[1]) == SImode
+ && TARGET_USE_VECTOR_CONVERTS && !optimize_size && TARGET_SSE_MATH
+ && SSE_FLOAT_MODE_P (DFmode))
+ operands[1] = force_reg (SImode, operands[1]);
+
+ if (GET_MODE (operands[0]) == SFmode && GET_MODE (operands[1]) == SImode
+ && !optimize_size && TARGET_USE_VECTOR_CONVERTS && TARGET_SSE_MATH
+ && SSE_FLOAT_MODE_P (SFmode))
+ {
+ /* When !flag_trapping_math, we handle SImode->SFmode vector
+ conversions the same way as SImode->DFmode.
+
+ With flag_trapping_math we can't safely use the vector conversion
+ without clearing the upper half first, otherwise a spurious precision
+ exception might occur. However we can still generate the common
+ sequence converting the value from a general register to an XMM
+ register as:
+
+ mov reg32, mem32
+ movd mem32, xmm
+ cvtdq2ps xmm, xmm
+
+ because we know that movd clears the upper half.
+
+ Sadly, in this case we can't rely on reload moving the value to an
+ XMM register, since we need to know that the upper half is clear, so
+ we do the reloading by hand. We force the operand to memory unless
+ the target supports inter-unit moves.
+ if (!flag_trapping_math)
+ operands[1] = force_reg (SImode, operands[1]);
+ else if (!MEM_P (operands[1]))
+ {
+ rtx tmp = assign_386_stack_local (SImode, SLOT_VIRTUAL);
+ emit_move_insn (tmp, operands[1]);
+ operands[1] = tmp;
+ }
+ }
+ ")
+
+(define_insn "*floatsisf2_mixed_vector"
+ [(set (match_operand:SF 0 "register_operand" "=x,f,?f")
+ (float:SF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+ "TARGET_MIX_SSE_I387 && !flag_trapping_math
+ && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+ "@
+ cvtdq2ps\t{%1, %0|%0, %1}
+ fild%z1\t%1
+ #"
+ [(set_attr "type" "sseicvt,fmov,multi")
+ (set_attr "mode" "SF")
+ (set_attr "unit" "*,i387,*")
+ (set_attr "athlon_decode" "double,*,*")
+ (set_attr "amdfam10_decode" "double,*,*")
+ (set_attr "fp_int_src" "false,true,true")])
(define_insn "*floatsisf2_mixed"
[(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
- "TARGET_MIX_SSE_I387"
+ "TARGET_MIX_SSE_I387
+ && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"@
fild%z1\t%1
#
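
The movd-then-packed-convert sequence described in the expander comment above behaves roughly like this intrinsics sketch (illustrative only; the helper name is ours): movd zeroes the upper 96 bits of the XMM register, and converting the zero lanes cannot raise an exception, so no spurious traps are introduced even with trapping math.

#include <emmintrin.h>

static inline float si_to_sf_packed (const int *p)
{
  __m128i v = _mm_cvtsi32_si128 (*p);  /* movd mem32, %xmm: upper bits cleared */
  __m128 f = _mm_cvtepi32_ps (v);      /* cvtdq2ps %xmm,%xmm */
  return _mm_cvtss_f32 (f);            /* (float) *p in lane 0 */
}
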
@@ -4703,10 +4846,68 @@
(set_attr "amdfam10_decode" "*,*,vector,double")
(set_attr "fp_int_src" "true")])
+(define_insn "*floatsisf2_sse_vector_nointernunit"
+ [(set (match_operand:SF 0 "register_operand" "=x")
+ (float:SF (match_operand:SI 1 "memory_operand" "m")))]
+ "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+ && !TARGET_INTER_UNIT_MOVES"
+ "#"
+ [(set_attr "type" "multi")])
+
+(define_insn "*floatsisf2_sse_vector_internunit"
+ [(set (match_operand:SF 0 "register_operand" "=x,x")
+ (float:SF (match_operand:SI 1 "nonimmediate_operand" "rm,x")))]
+ "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+ && TARGET_INTER_UNIT_MOVES"
+ "#"
+ [(set_attr "type" "multi")])
+
+(define_split
+ [(set (match_operand:SF 0 "register_operand" "")
+ (float:SF (match_operand:SI 1 "nonimmediate_operand" "")))]
+ "flag_trapping_math
+ && TARGET_USE_VECTOR_CONVERTS && reload_completed
+ && (TARGET_INTER_UNIT_MOVES || MEM_P (operands[1]))
+ && !SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+ [(set (match_dup 0)
+ (float:V4SF (match_dup 2)))]
+{
+ operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+ operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+ emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
+(define_split
+ [(set (match_operand:SF 0 "register_operand" "")
+ (float:SF (match_operand:SI 1 "register_operand" "")))]
+ "flag_trapping_math
+ && TARGET_USE_VECTOR_CONVERTS && reload_completed
+ && SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+ [(set (match_dup 2) (vec_duplicate:V4SI (match_dup 1)))
+ (set (match_dup 0)
+ (float:V4SF (match_dup 2)))]
+{
+ operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+ operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+})
+
+(define_insn "*floatsisf2_sse_vector"
+ [(set (match_operand:SF 0 "register_operand" "=x")
+ (float:SF (match_operand:SI 1 "register_operand" "x")))]
+ "!flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+ && !TARGET_INTER_UNIT_MOVES"
+ "cvtpq2ps\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "mode" "SF")
+ (set_attr "athlon_decode" "double")
+ (set_attr "amdfam10_decode" "double")
+ (set_attr "fp_int_src" "true")])
+
(define_insn "*floatsisf2_sse"
[(set (match_operand:SF 0 "register_operand" "=x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
- "TARGET_SSE_MATH"
+ "TARGET_SSE_MATH
+ && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"cvtsi2ss\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "SF")
@@ -4714,38 +4915,89 @@
(set_attr "amdfam10_decode" "vector,double")
(set_attr "fp_int_src" "true")])
+(define_insn "*floatsidf2_mixed_vector"
+ [(set (match_operand:DF 0 "register_operand" "=x,f,f")
+ (float:DF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+ "TARGET_SSE2 && TARGET_MIX_SSE_I387
+ && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+ "@
+ cvtdq2pd\t{%1, %0|%0, %1}
+ fild%z1\t%1
+ #"
+ [(set_attr "type" "sseicvt,fmov,multi")
+ (set_attr "mode" "V2DF,DF,DF")
+ (set_attr "unit" "*,*,i387")
+ (set_attr "athlon_decode" "double,*,*")
+ (set_attr "amdfam10_decode" "double,*,*")
+ (set_attr "fp_int_src" "false,true,true")])
+
(define_insn "*floatsidf2_mixed"
- [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x")
- (float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
- "TARGET_SSE2 && TARGET_MIX_SSE_I387"
+ [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x,!x")
+ (float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m,x")))]
+ "TARGET_SSE2 && TARGET_MIX_SSE_I387
+ && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"@
fild%z1\t%1
#
cvtsi2sd\t{%1, %0|%0, %1}
- cvtsi2sd\t{%1, %0|%0, %1}"
- [(set_attr "type" "fmov,multi,sseicvt,sseicvt")
- (set_attr "mode" "DF")
- (set_attr "unit" "*,i387,*,*")
- (set_attr "athlon_decode" "*,*,double,direct")
- (set_attr "amdfam10_decode" "*,*,vector,double")
+ cvtsi2sd\t{%1, %0|%0, %1}
+ cvtdq2pd\t{%1, %0|%0, %1}"
+ [(set_attr "type" "fmov,multi,sseicvt,sseicvt,sseicvt")
+ (set_attr "mode" "DF,DF,DF,DF,V2DF")
+ (set_attr "unit" "*,i387,*,*,*")
+ (set_attr "athlon_decode" "*,*,double,direct,double")
+ (set_attr "amdfam10_decode" "*,*,vector,double,double")
+ (set_attr "fp_int_src" "true,true,true,true,false")])
+
+(define_insn "*floatsidf2_sse_vector"
+ [(set (match_operand:DF 0 "register_operand" "=x")
+ (float:DF (match_operand:SI 1 "register_operand" "x")))]
+ "TARGET_SSE2 && TARGET_SSE_MATH
+ && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+ "cvtdq2pd\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "mode" "V2DF")
+ (set_attr "athlon_decode" "double")
+ (set_attr "amdfam10_decode" "double")
(set_attr "fp_int_src" "true")])
+(define_split
+ [(set (match_operand:DF 0 "register_operand" "")
+ (float:DF (match_operand:SI 1 "memory_operand" "")))]
+ "TARGET_USE_VECTOR_CONVERTS && reload_completed
+ && SSE_REG_P (operands[0])"
+ [(set (match_dup 0)
+ (float:V2DF
+ (vec_select:V2SI
+ (match_dup 2)
+ (parallel [(const_int 0) (const_int 1)]))))]
+{
+ operands[2] = simplify_gen_subreg (V4SImode, operands[0], DFmode, 0);
+ operands[0] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+ emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
(define_insn "*floatsidf2_sse"
- [(set (match_operand:DF 0 "register_operand" "=x,x")
- (float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
- "TARGET_SSE2 && TARGET_SSE_MATH"
- "cvtsi2sd\t{%1, %0|%0, %1}"
+ [(set (match_operand:DF 0 "register_operand" "=x,x,!x")
+ (float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m,x")))]
+ "TARGET_SSE2 && TARGET_SSE_MATH
+ && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
+ "@
+ cvtsi2sd\t{%1, %0|%0, %1}
+ cvtsi2sd\t{%1, %0|%0, %1}
+ cvtdq2pd\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
- (set_attr "mode" "DF")
- (set_attr "athlon_decode" "double,direct")
- (set_attr "amdfam10_decode" "vector,double")
+ (set_attr "mode" "DF,DF,V2DF")
+ (set_attr "athlon_decode" "double,direct,double")
+ (set_attr "amdfam10_decode" "vector,double,double")
(set_attr "fp_int_src" "true")])
(define_insn "*floatsi<mode>2_i387"
[(set (match_operand:MODEF 0 "register_operand" "=f,f")
(float:MODEF
(match_operand:SI 1 "nonimmediate_operand" "m,?r")))]
- "TARGET_80387"
+ "TARGET_80387
+ && (!TARGET_SSE_MATH || !SSE_FLOAT_MODE_P (GET_MODE (operands[0])))"
"@
fild%z1\t%1
#"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index cb63ab9..03b2577 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2740,7 +2740,7 @@
[(set_attr "type" "sselog1")
(set_attr "mode" "DF")])
-(define_insn "*vec_dupv2df"
+(define_insn "vec_dupv2df"
[(set (match_operand:V2DF 0 "register_operand" "=x")
(vec_duplicate:V2DF
(match_operand:DF 1 "register_operand" "0")))]