 gcc/ChangeLog           |  17 +
 gcc/config/i386/i386.c  |   4 +
 gcc/config/i386/i386.h  |   2 +
 gcc/config/i386/i386.md | 292 ++++++++++++++++++++++++++++++++++++++++-----
 gcc/config/i386/sse.md  |   2 +-
 5 files changed, 296 insertions(+), 21 deletions(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index ff54c46..d748be2 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,20 @@
+2007-09-09  Jan Hubicka  <jh@suse.cz>
+	    Dwarakanath Rajagopal  <dwarak.rajagopal@amd.com>
+
+	* i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
+	(TARGET_USE_VECTOR_CONVERTS): New.
+	* i386.md: New post-reload splitters for converting SF to DF and
+	DF to SF.
+	(floatsi* expander): Special case vector conversions.
+	(floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
+	floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
+	floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
+	(floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
+	Disable when doing vector converts.
+	(floatsi<mode>2_i387): Disable when doing SSE math.
+	* sse.md (vec_dupv2df): Export.
+	* i386.c (ix86_tune_features): Enable SSE conversions.
+
 2007-09-09  Richard Guenther  <rguenther@suse.de>
 
 	* tree-ssa-operands.c (add_virtual_operand): Only mark
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f6f80a0..c01198b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1258,6 +1258,10 @@ unsigned int ix86_tune_features[X86_TUNE_LAST] = {
      operand that cannot be represented using a modRM byte.  The XOR
      replacement is long decoded, so this split helps here as well.  */
   m_K6,
+
+  /* X86_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion from
+     integer to FP.  */
+  m_AMDFAM10,
 };
 
 /* Feature tests against the various architecture variations.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 93e24dd..06e90f4 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -257,6 +257,7 @@ enum ix86_tune_indices {
   X86_TUNE_MOVE_M1_VIA_OR,
   X86_TUNE_NOT_UNPAIRABLE,
   X86_TUNE_NOT_VECTORMODE,
+  X86_USE_VECTOR_CONVERTS,
 
   X86_TUNE_LAST
 };
@@ -337,6 +338,7 @@ extern unsigned int ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_MOVE_M1_VIA_OR	ix86_tune_features[X86_TUNE_MOVE_M1_VIA_OR]
 #define TARGET_NOT_UNPAIRABLE	ix86_tune_features[X86_TUNE_NOT_UNPAIRABLE]
 #define TARGET_NOT_VECTORMODE	ix86_tune_features[X86_TUNE_NOT_VECTORMODE]
+#define TARGET_USE_VECTOR_CONVERTS ix86_tune_features[X86_USE_VECTOR_CONVERTS]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 42b3bab..352f67d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3916,6 +3916,49 @@
     }
 })
 
+/* For converting SF(xmm2) to DF(xmm1), use the following code instead of
+   cvtss2sd:
+	unpcklps xmm2,xmm2	; packed conversion might trap on signaling NaNs
+	cvtps2pd xmm2,xmm1
+   We do the conversion post reload to avoid producing 128-bit spills,
+   which might lead to an ICE on a 32-bit target.  The sequence is
+   unlikely to be combined anyway.  */
+(define_split
+  [(set (match_operand:DF 0 "register_operand" "")
+	(float_extend:DF
+	  (match_operand:SF 1 "nonimmediate_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 2)
+	(float_extend:V2DF
+	  (vec_select:V2SF
+	    (match_dup 3)
+	    (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[2] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+  operands[3] = simplify_gen_subreg (V4SFmode, operands[0], DFmode, 0);
+  /* Use movss for loading from memory, unpcklps reg, reg for registers.
+     Try to avoid the move when unpacking can be done in the source.  */
+  if (REG_P (operands[1]))
+    {
+      /* If it is unsafe to overwrite the upper half of the source, we
+	 need to move the value to the destination and unpack there.  */
+      if ((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+	   || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
+	  && true_regnum (operands[0]) != true_regnum (operands[1]))
+	{
+	  rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+	  emit_move_insn (tmp, operands[1]);
+	}
+      else
+	operands[3] = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+      emit_insn (gen_sse_unpcklps (operands[3], operands[3], operands[3]));
+    }
+  else
+    emit_insn (gen_vec_setv4sf_0 (operands[3],
+				  CONST0_RTX (V4SFmode), operands[1]));
+})
+
 (define_insn "*extendsfdf2_mixed"
   [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
	(float_extend:DF
@@ -4009,6 +4052,51 @@
     }
 })
 
+/* For converting DF(xmm2) to SF(xmm1), use the following code instead of
+   cvtsd2ss:
+	unpcklpd xmm2,xmm2	; packed conversion might trap on signaling NaNs
+	cvtpd2ps xmm2,xmm1
+   We do the conversion post reload to avoid producing 128-bit spills,
+   which might lead to an ICE on a 32-bit target.  The sequence is
+   unlikely to be combined anyway.  */
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+	(float_truncate:SF
+	  (match_operand:DF 1 "nonimmediate_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 2)
+	(vec_concat:V4SF
+	  (float_truncate:V2SF
+	    (match_dup 4))
+	  (match_dup 3)))]
+{
+  operands[2] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+  operands[3] = CONST0_RTX (V2SFmode);
+  operands[4] = simplify_gen_subreg (V2DFmode, operands[0], SFmode, 0);
+  /* Use movsd for loading from memory, unpcklpd for registers.
+     Try to avoid the move when unpacking can be done in the source,
+     or when the SSE3 movddup is available.  */
+  if (REG_P (operands[1]))
+    {
+      if (!TARGET_SSE3
+	  && true_regnum (operands[0]) != true_regnum (operands[1])
+	  && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+	      || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+	{
+	  rtx tmp = simplify_gen_subreg (DFmode, operands[0], SFmode, 0);
+	  emit_move_insn (tmp, operands[1]);
+	  operands[1] = tmp;
+	}
+      else if (!TARGET_SSE3)
+	operands[4] = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0);
+      emit_insn (gen_vec_dupv2df (operands[4], operands[1]));
+    }
+  else
+    emit_insn (gen_sse2_loadlpd (operands[4],
+				 CONST0_RTX (V2DFmode), operands[1]));
+})
+
 (define_expand "truncdfsf2_with_temp"
   [(parallel [(set (match_operand:SF 0 "" "")
		   (float_truncate:SF (match_operand:DF 1 "" "")))
@@ -4685,12 +4773,67 @@
   [(set (match_operand:MODEF 0 "register_operand" "")
	(float:MODEF (match_operand:SI 1 "nonimmediate_operand" "")))]
   "TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
-  "")
+  "
+  /* When we use vector converts, we can't have the input in memory.  */
+  if (GET_MODE (operands[0]) == DFmode && GET_MODE (operands[1]) == SImode
+      && TARGET_USE_VECTOR_CONVERTS && !optimize_size && TARGET_SSE_MATH
+      && SSE_FLOAT_MODE_P (DFmode))
+    operands[1] = force_reg (SImode, operands[1]);
+
+  if (GET_MODE (operands[0]) == SFmode && GET_MODE (operands[1]) == SImode
+      && !optimize_size && TARGET_USE_VECTOR_CONVERTS && TARGET_SSE_MATH
+      && SSE_FLOAT_MODE_P (SFmode))
+    {
+      /* When !flag_trapping_math, we handle SImode->SFmode vector
+	 conversions the same way as SImode->DFmode.
+
+	 With flag_trapping_math we can't safely use the vector conversion
+	 without clearing the upper half, otherwise a precision exception
+	 might occur.
+
+	 However, we can still generate the common sequence converting the
+	 value from a general register to an XMM register as:
+
+	   mov reg32, mem32
+	   movd mem32, xmm
+	   cvtdq2pd xmm,xmm
+
+	 because we know that movd clears the upper half.
+
+	 Sadly, in this case we can't rely on reload moving the value to
+	 an XMM register, since we need to know whether the upper half is
+	 OK, so we need to do the reloading by hand.  We force the operand
+	 to memory unless the target supports inter-unit moves.  */
+      if (!flag_trapping_math)
+	operands[1] = force_reg (SImode, operands[1]);
+      else if (!MEM_P (operands[1]) && !TARGET_INTER_UNIT_MOVES)
+	{
+	  rtx tmp = assign_386_stack_local (SImode, SLOT_VIRTUAL);
+	  emit_move_insn (tmp, operands[1]);
+	  operands[1] = tmp;
+	}
+    }
+  ")
+
+(define_insn "*floatsisf2_mixed_vector"
+  [(set (match_operand:SF 0 "register_operand" "=x,f,?f")
+	(float:SF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+  "TARGET_MIX_SSE_I387 && !flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "@
+   cvtdq2ps\t{%1, %0|%0, %1}
+   fild%z1\t%1
+   #"
+  [(set_attr "type" "sseicvt,fmov,multi")
+   (set_attr "mode" "SF")
+   (set_attr "unit" "*,i387,*")
+   (set_attr "athlon_decode" "double,*,*")
+   (set_attr "amdfam10_decode" "double,*,*")
+   (set_attr "fp_int_src" "false,true,true")])
 
 (define_insn "*floatsisf2_mixed"
   [(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
	(float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
-  "TARGET_MIX_SSE_I387"
+  "TARGET_MIX_SSE_I387
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "@
    fild%z1\t%1
    #
@@ -4703,10 +4846,68 @@
    (set_attr "amdfam10_decode" "*,*,vector,double")
    (set_attr "fp_int_src" "true")])
 
+(define_insn "*floatsisf2_sse_vector_nointernunit"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+	(float:SF (match_operand:SI 1 "memory_operand" "m")))]
+  "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && !TARGET_INTER_UNIT_MOVES"
+  "#"
+  [(set_attr "type" "multi")])
+
+(define_insn "*floatsisf2_sse_vector_internunit"
+  [(set (match_operand:SF 0 "register_operand" "=x,x")
+	(float:SF (match_operand:SI 1 "nonimmediate_operand" "rm,x")))]
+  "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && TARGET_INTER_UNIT_MOVES"
+  "#"
+  [(set_attr "type" "multi")])
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+	(float:SF (match_operand:SI 1 "nonimmediate_operand" "")))]
+  "flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && (TARGET_INTER_UNIT_MOVES || MEM_P (operands[1]))
+   && !SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+	(float:V4SF (match_dup 2)))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+  operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+  emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+	(float:SF (match_operand:SI 1 "register_operand" "")))]
+  "flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+  [(set (match_dup 2) (vec_duplicate:V4SI (match_dup 1)))
+   (set (match_dup 0)
+	(float:V4SF (match_dup 2)))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+  operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+})
+
+(define_insn "*floatsisf2_sse_vector"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+	(float:SF (match_operand:SI 1 "register_operand" "x")))]
+  "!flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && !TARGET_INTER_UNIT_MOVES"
+  "cvtdq2ps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "mode" "SF")
+   (set_attr "athlon_decode" "double")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "fp_int_src" "true")])
+
 (define_insn "*floatsisf2_sse"
   [(set (match_operand:SF 0 "register_operand" "=x,x")
	(float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
-  "TARGET_SSE_MATH"
+  "TARGET_SSE_MATH
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "cvtsi2ss\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "mode" "SF")
@@ -4714,38 +4915,89 @@
    (set_attr "amdfam10_decode" "vector,double")
    (set_attr "fp_int_src" "true")])
 
+(define_insn "*floatsidf2_mixed_vector"
+  [(set (match_operand:DF 0 "register_operand" "=x,f,f")
+	(float:DF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "@
+   cvtdq2pd\t{%1, %0|%0, %1}
+   fild%z1\t%1
+   #"
+  [(set_attr "type" "sseicvt,fmov,multi")
+   (set_attr "mode" "V2DF,DF,DF")
+   (set_attr "unit" "*,*,i387")
+   (set_attr "athlon_decode" "double,*,*")
+   (set_attr "amdfam10_decode" "double,*,*")
+   (set_attr "fp_int_src" "false,true,true")])
+
 (define_insn "*floatsidf2_mixed"
-  [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x")
-	(float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
-  "TARGET_SSE2 && TARGET_MIX_SSE_I387"
+  [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x,!x")
+	(float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m,x")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "@
    fild%z1\t%1
    #
    cvtsi2sd\t{%1, %0|%0, %1}
-   cvtsi2sd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "fmov,multi,sseicvt,sseicvt")
-   (set_attr "mode" "DF")
-   (set_attr "unit" "*,i387,*,*")
-   (set_attr "athlon_decode" "*,*,double,direct")
-   (set_attr "amdfam10_decode" "*,*,vector,double")
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtdq2pd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "fmov,multi,sseicvt,sseicvt,sseicvt")
+   (set_attr "mode" "DF,DF,DF,DF,V2DF")
+   (set_attr "unit" "*,i387,*,*,*")
+   (set_attr "athlon_decode" "*,*,double,direct,double")
+   (set_attr "amdfam10_decode" "*,*,vector,double,double")
+   (set_attr "fp_int_src" "true,true,true,true,false")])
+
+(define_insn "*floatsidf2_sse_vector"
+  [(set (match_operand:DF 0 "register_operand" "=x")
+	(float:DF (match_operand:SI 1 "register_operand" "x")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "cvtdq2pd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "mode" "V2DF")
+   (set_attr "athlon_decode" "double")
+   (set_attr "amdfam10_decode" "double")
    (set_attr "fp_int_src" "true")])
 
+(define_split
+  [(set (match_operand:DF 0 "register_operand" "")
+	(float:DF (match_operand:SI 1 "memory_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+	(float:V2DF
+	  (vec_select:V2SI
+	    (match_dup 2)
+	    (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], DFmode, 0);
+  operands[0] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+  emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
 (define_insn "*floatsidf2_sse"
-  [(set (match_operand:DF 0 "register_operand" "=x,x")
-	(float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH"
-  "cvtsi2sd\t{%1, %0|%0, %1}"
+  [(set (match_operand:DF 0 "register_operand" "=x,x,!x")
+	(float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m,x")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
+  "@
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtdq2pd\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
-   (set_attr "mode" "DF")
-   (set_attr "athlon_decode" "double,direct")
-   (set_attr "amdfam10_decode" "vector,double")
+   (set_attr "mode" "DF,DF,V2DF")
+   (set_attr "athlon_decode" "double,direct,double")
+   (set_attr "amdfam10_decode" "vector,double,double")
    (set_attr "fp_int_src" "true")])
 
 (define_insn "*floatsi<mode>2_i387"
   [(set (match_operand:MODEF 0 "register_operand" "=f,f")
	(float:MODEF (match_operand:SI 1 "nonimmediate_operand" "m,?r")))]
-  "TARGET_80387"
+  "TARGET_80387
+   && (!TARGET_SSE_MATH || !SSE_FLOAT_MODE_P (GET_MODE (operands[0])))"
   "@
    fild%z1\t%1
    #"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index cb63ab9..03b2577 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2740,7 +2740,7 @@
   [(set_attr "type" "sselog1")
    (set_attr "mode" "DF")])
 
-(define_insn "*vec_dupv2df"
+(define_insn "vec_dupv2df"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
	(vec_duplicate:V2DF
	  (match_operand:DF 1 "register_operand" "0")))]