From 4845dbb50ed31ad03c579364e4b70bbe90e7af99 Mon Sep 17 00:00:00 2001
From: Jan Hubicka
Date: Sun, 9 Sep 2007 19:39:28 +0200
Subject: i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.

	* i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
	(TARGET_USE_VECTOR_CONVERTS): New.
	* i386.md: New post-reload splitters for converting SF to DF and
	DF to SF.
	(floatsi* expander): Special case vector conversions.
	(floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
	floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
	floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
	(floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
	Disable when doing vector converts.
	(floatsi<mode>2_i387): Disable when doing SSE math.
	* sse.md (vec_dupv2df): Export.
	* i386.c (ix86_tune_features): Enable SSE conversions.

Co-Authored-By: Dwarakanath Rajagopal

From-SVN: r128301
---
 gcc/ChangeLog           |  17 +++
 gcc/config/i386/i386.c  |   4 +
 gcc/config/i386/i386.h  |   2 +
 gcc/config/i386/i386.md | 292 ++++++++++++++++++++++++++++++++++++++++++++----
 gcc/config/i386/sse.md  |   2 +-
 5 files changed, 296 insertions(+), 21 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index ff54c46..d748be2 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,20 @@
+2007-09-09  Jan Hubicka
+	    Dwarakanath Rajagopal
+
+	* i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
+	(TARGET_USE_VECTOR_CONVERTS): New.
+	* i386.md: New post-reload splitters for converting SF to DF and
+	DF to SF.
+	(floatsi* expander): Special case vector conversions.
+	(floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
+	floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
+	floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
+	(floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
+	Disable when doing vector converts.
+	(floatsi<mode>2_i387): Disable when doing SSE math.
+	* sse.md (vec_dupv2df): Export.
+	* i386.c (ix86_tune_features): Enable SSE conversions.
+
 2007-09-09  Richard Guenther
 
 	* tree-ssa-operands.c (add_virtual_operand): Only mark
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f6f80a0..c01198b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1258,6 +1258,10 @@ unsigned int ix86_tune_features[X86_TUNE_LAST] = {
      operand that cannot be represented using a modRM byte.  The XOR
      replacement is long decoded, so this split helps here as well.  */
   m_K6,
+
+  /* X86_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion from
+     integer to FP.  */
+  m_AMDFAM10,
 };
 
 /* Feature tests against the various architecture variations.  */
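The new tuning bit is enabled only for AMD Family 10h. Per the amdfam10_decode attributes added below, a scalar cvtsi2ss/cvtsi2sd from a general register decodes as a "vector" op on that core, while the packed cvtdq2ps/cvtdq2pd decodes as "double". A minimal sketch of the kind of conversion the tuning targets; the function name, flags, and register choices are illustrative assumptions, not taken from the patch:

    /* Illustrative only: int -> double conversion affected by the tuning.
       With -O2 -march=amdfam10 the generic scalar code is

           cvtsi2sd  %edi, %xmm0        ; "vector" decode on Family 10h

       while the sequence this patch prefers is

           movd      %edi, %xmm0        ; movd clears the upper half
           cvtdq2pd  %xmm0, %xmm0       ; packed convert, "double" decode  */
    double
    int_to_double (int i)
    {
      return (double) i;
    }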
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 93e24dd..06e90f4 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -257,6 +257,7 @@ enum ix86_tune_indices {
   X86_TUNE_MOVE_M1_VIA_OR,
   X86_TUNE_NOT_UNPAIRABLE,
   X86_TUNE_NOT_VECTORMODE,
+  X86_USE_VECTOR_CONVERTS,
 
   X86_TUNE_LAST
 };
@@ -337,6 +338,7 @@ extern unsigned int ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_MOVE_M1_VIA_OR	ix86_tune_features[X86_TUNE_MOVE_M1_VIA_OR]
 #define TARGET_NOT_UNPAIRABLE	ix86_tune_features[X86_TUNE_NOT_UNPAIRABLE]
 #define TARGET_NOT_VECTORMODE	ix86_tune_features[X86_TUNE_NOT_VECTORMODE]
+#define TARGET_USE_VECTOR_CONVERTS ix86_tune_features[X86_USE_VECTOR_CONVERTS]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 42b3bab..352f67d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3916,6 +3916,49 @@
 	}
 })
 
+/* For converting SF(xmm2) to DF(xmm1), use the following code instead of
+   cvtss2sd:
+      unpcklps xmm2,xmm2	; packed conversion might trap on signaling NaNs
+      cvtps2pd xmm2,xmm1
+   We do the conversion post reload to avoid producing 128bit spills,
+   which might lead to an ICE on a 32bit target.  The sequence is
+   unlikely to combine anyway.  */
+(define_split
+  [(set (match_operand:DF 0 "register_operand" "")
+	(float_extend:DF
+	  (match_operand:SF 1 "nonimmediate_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 2)
+	(float_extend:V2DF
+	  (vec_select:V2SF
+	    (match_dup 3)
+	    (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[2] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+  operands[3] = simplify_gen_subreg (V4SFmode, operands[0], DFmode, 0);
+  /* Use movss for loading from memory, unpcklps reg, reg for registers.
+     Try to avoid the move when unpacking can be done in the source.  */
+  if (REG_P (operands[1]))
+    {
+      /* If it is unsafe to overwrite the upper half of the source, we need
+	 to move to the destination and unpack there.  */
+      if ((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+	   || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
+	  && true_regnum (operands[0]) != true_regnum (operands[1]))
+	{
+	  rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+	  emit_move_insn (tmp, operands[1]);
+	}
+      else
+	operands[3] = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+      emit_insn (gen_sse_unpcklps (operands[3], operands[3], operands[3]));
+    }
+  else
+    emit_insn (gen_vec_setv4sf_0 (operands[3],
+				  CONST0_RTX (V4SFmode), operands[1]));
+})
+
 (define_insn "*extendsfdf2_mixed"
   [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
 	(float_extend:DF
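For readers following the splitter above, a sketch of the intended replacement for the register-to-register case; register names are illustrative:

    /* Illustrative only: SF -> DF extension after the split.  Instead of

           cvtss2sd  %xmm1, %xmm0

       the splitter arranges for

           unpcklps  %xmm1, %xmm1       ; duplicate the low element so no
                                        ; garbage lane reaches the convert
           cvtps2pd  %xmm1, %xmm0       ; packed SF -> DF

       A packed convert of an uninitialized lane could trap on a signaling
       NaN bit pattern, hence the unpack first.  */
    double
    extend_sf_df (float f)
    {
      return (double) f;
    }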
@@ -4009,6 +4052,51 @@
 	}
 })
 
+/* For converting DF(xmm2) to SF(xmm1), use the following code instead of
+   cvtsd2ss:
+      unpcklpd xmm2,xmm2	; packed conversion might trap on signaling NaNs
+      cvtpd2ps xmm2,xmm1
+   We do the conversion post reload to avoid producing 128bit spills,
+   which might lead to an ICE on a 32bit target.  The sequence is
+   unlikely to combine anyway.  */
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+	(float_truncate:SF
+	  (match_operand:DF 1 "nonimmediate_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 2)
+	(vec_concat:V4SF
+	  (float_truncate:V2SF
+	    (match_dup 4))
+	  (match_dup 3)))]
+{
+  operands[2] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+  operands[3] = CONST0_RTX (V2SFmode);
+  operands[4] = simplify_gen_subreg (V2DFmode, operands[0], SFmode, 0);
+  /* Use movsd for loading from memory, unpcklpd for registers.
+     Try to avoid the move when unpacking can be done in the source, or
+     when the SSE3 movddup is available.  */
+  if (REG_P (operands[1]))
+    {
+      if (!TARGET_SSE3
+	  && true_regnum (operands[0]) != true_regnum (operands[1])
+	  && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+	      || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+	{
+	  rtx tmp = simplify_gen_subreg (DFmode, operands[0], SFmode, 0);
+	  emit_move_insn (tmp, operands[1]);
+	  operands[1] = tmp;
+	}
+      else if (!TARGET_SSE3)
+	operands[4] = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0);
+      emit_insn (gen_vec_dupv2df (operands[4], operands[1]));
+    }
+  else
+    emit_insn (gen_sse2_loadlpd (operands[4],
+				 CONST0_RTX (V2DFmode), operands[1]));
+})
+
 (define_expand "truncdfsf2_with_temp"
   [(parallel [(set (match_operand:SF 0 "" "")
 		   (float_truncate:SF (match_operand:DF 1 "" "")))
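The splitter above is the mirror image for truncation; a sketch of the emitted sequence under the same assumptions (illustrative registers):

    /* Illustrative only: DF -> SF truncation after the split.  Instead of

           cvtsd2ss  %xmm1, %xmm0

       the splitter arranges for

           unpcklpd  %xmm1, %xmm1       ; movddup %xmm1, %xmm1 with -msse3
           cvtpd2ps  %xmm1, %xmm0       ; packed DF -> SF  */
    float
    trunc_df_sf (double d)
    {
      return (float) d;
    }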
@@ -4685,12 +4773,67 @@
   [(set (match_operand:MODEF 0 "register_operand" "")
 	(float:MODEF (match_operand:SI 1 "nonimmediate_operand" "")))]
   "TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
-  "")
+  "
+  /* When we use vector converts, we can't have input in memory.  */
+  if (GET_MODE (operands[0]) == DFmode && GET_MODE (operands[1]) == SImode
+      && TARGET_USE_VECTOR_CONVERTS && !optimize_size && TARGET_SSE_MATH
+      && SSE_FLOAT_MODE_P (DFmode))
+    operands[1] = force_reg (SImode, operands[1]);
+
+  if (GET_MODE (operands[0]) == SFmode && GET_MODE (operands[1]) == SImode
+      && !optimize_size && TARGET_USE_VECTOR_CONVERTS && TARGET_SSE_MATH
+      && SSE_FLOAT_MODE_P (SFmode))
+    {
+      /* When !flag_trapping_math, we handle SImode->SFmode vector
+	 conversions the same way as SImode->DFmode.
+
+	 For flag_trapping_math we can't safely use a vector conversion
+	 without clearing the upper half, otherwise a precision exception
+	 might occur.  However we can still generate the common sequence
+	 converting a value from a general register to an XMM register as:
+
+	   mov reg32, mem32
+	   movd mem32, xmm
+	   cvtdq2ps xmm,xmm
+
+	 because we know that movd clears the upper half.
+
+	 Sadly in this case we can't rely on reload moving the value to an
+	 XMM register, since we need to know whether the upper half is OK,
+	 so we need to do the reloading by hand.  We force the operand to
+	 memory unless the target supports inter unit moves.  */
+      if (!flag_trapping_math)
+	operands[1] = force_reg (SImode, operands[1]);
+      else if (!MEM_P (operands[1]))
+	{
+	  rtx tmp = assign_386_stack_local (SImode, SLOT_VIRTUAL);
+	  emit_move_insn (tmp, operands[1]);
+	  operands[1] = tmp;
+	}
+    }
+  ")
+
+(define_insn "*floatsisf2_mixed_vector"
+  [(set (match_operand:SF 0 "register_operand" "=x,f,?f")
+	(float:SF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+  "TARGET_MIX_SSE_I387 && !flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "@
+   cvtdq2ps\t{%1, %0|%0, %1}
+   fild%z1\t%1
+   #"
+  [(set_attr "type" "sseicvt,fmov,multi")
+   (set_attr "mode" "SF")
+   (set_attr "unit" "*,i387,*")
+   (set_attr "athlon_decode" "double,*,*")
+   (set_attr "amdfam10_decode" "double,*,*")
+   (set_attr "fp_int_src" "false,true,true")])
 
 (define_insn "*floatsisf2_mixed"
   [(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
 	(float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
-  "TARGET_MIX_SSE_I387"
+  "TARGET_MIX_SSE_I387
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "@
    fild%z1\t%1
    #
@@ -4703,10 +4846,68 @@
    (set_attr "amdfam10_decode" "*,*,vector,double")
    (set_attr "fp_int_src" "true")])
 
+(define_insn "*floatsisf2_sse_vector_nointernunit"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+	(float:SF (match_operand:SI 1 "memory_operand" "m")))]
+  "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && !TARGET_INTER_UNIT_MOVES"
+  "#"
+  [(set_attr "type" "multi")])
+
+(define_insn "*floatsisf2_sse_vector_internunit"
+  [(set (match_operand:SF 0 "register_operand" "=x,x")
+	(float:SF (match_operand:SI 1 "nonimmediate_operand" "rm,x")))]
+  "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && TARGET_INTER_UNIT_MOVES"
+  "#"
+  [(set_attr "type" "multi")])
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+	(float:SF (match_operand:SI 1 "nonimmediate_operand" "")))]
+  "flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && (TARGET_INTER_UNIT_MOVES || MEM_P (operands[1]))
+   && !SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+	(float:V4SF (match_dup 2)))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+  operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+  emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+	(float:SF (match_operand:SI 1 "register_operand" "")))]
+  "flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+  [(set (match_dup 2) (vec_duplicate:V4SI (match_dup 1)))
+   (set (match_dup 0)
+	(float:V4SF (match_dup 2)))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+  operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+})
+
+(define_insn "*floatsisf2_sse_vector"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+	(float:SF (match_operand:SI 1 "register_operand" "x")))]
+  "!flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && !TARGET_INTER_UNIT_MOVES"
+  "cvtdq2ps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "mode" "SF")
+   (set_attr "athlon_decode" "double")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "fp_int_src" "true")])
+
 (define_insn "*floatsisf2_sse"
   [(set (match_operand:SF 0 "register_operand" "=x,x")
 	(float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
-  "TARGET_SSE_MATH"
+  "TARGET_SSE_MATH
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "cvtsi2ss\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "mode" "SF")
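The flag_trapping_math splitters above rely on the fact called out in the expander comment: movd zeroes bits 32..127 of the destination XMM register, so the extra lanes that the packed convert touches hold exact zeros. A sketch of the safe sequence under trapping math (stack slot and registers illustrative):

    /* Illustrative only: int -> float with -ftrapping-math.  cvtdq2ps
       converts all four lanes, so garbage in the upper lanes could raise
       a spurious precision exception.  The split therefore builds the
       input with an upper-half-clearing load:

           movd      4(%esp), %xmm0     ; zeroes bits 32..127
           cvtdq2ps  %xmm0, %xmm0       ; safe: upper lanes convert 0 exactly  */
    float
    int_to_float (int i)
    {
      return (float) i;
    }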
@@ -4714,38 +4915,89 @@
    (set_attr "amdfam10_decode" "vector,double")
    (set_attr "fp_int_src" "true")])
 
+(define_insn "*floatsidf2_mixed_vector"
+  [(set (match_operand:DF 0 "register_operand" "=x,f,f")
+	(float:DF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "@
+   cvtdq2pd\t{%1, %0|%0, %1}
+   fild%z1\t%1
+   #"
+  [(set_attr "type" "sseicvt,fmov,multi")
+   (set_attr "mode" "V2DF,DF,DF")
+   (set_attr "unit" "*,*,i387")
+   (set_attr "athlon_decode" "double,*,*")
+   (set_attr "amdfam10_decode" "double,*,*")
+   (set_attr "fp_int_src" "false,true,true")])
+
 (define_insn "*floatsidf2_mixed"
-  [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x")
-	(float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
-  "TARGET_SSE2 && TARGET_MIX_SSE_I387"
+  [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x,!x")
+	(float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m,x")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "@
    fild%z1\t%1
    #
    cvtsi2sd\t{%1, %0|%0, %1}
-   cvtsi2sd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "fmov,multi,sseicvt,sseicvt")
-   (set_attr "mode" "DF")
-   (set_attr "unit" "*,i387,*,*")
-   (set_attr "athlon_decode" "*,*,double,direct")
-   (set_attr "amdfam10_decode" "*,*,vector,double")
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtdq2pd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "fmov,multi,sseicvt,sseicvt,sseicvt")
+   (set_attr "mode" "DF,DF,DF,DF,V2DF")
+   (set_attr "unit" "*,i387,*,*,*")
+   (set_attr "athlon_decode" "*,*,double,direct,double")
+   (set_attr "amdfam10_decode" "*,*,vector,double,double")
+   (set_attr "fp_int_src" "true,true,true,true,false")])
+
+(define_insn "*floatsidf2_sse_vector"
+  [(set (match_operand:DF 0 "register_operand" "=x")
+	(float:DF (match_operand:SI 1 "register_operand" "x")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "cvtdq2pd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "mode" "V2DF")
+   (set_attr "athlon_decode" "double")
+   (set_attr "amdfam10_decode" "double")
    (set_attr "fp_int_src" "true")])
 
+(define_split
+  [(set (match_operand:DF 0 "register_operand" "")
+	(float:DF (match_operand:SI 1 "memory_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+	(float:V2DF
+	  (vec_select:V2SI
+	    (match_dup 2)
+	    (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], DFmode, 0);
+  operands[0] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+  emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
 (define_insn "*floatsidf2_sse"
-  [(set (match_operand:DF 0 "register_operand" "=x,x")
-	(float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH"
-  "cvtsi2sd\t{%1, %0|%0, %1}"
+  [(set (match_operand:DF 0 "register_operand" "=x,x,!x")
+	(float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m,x")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
+  "@
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtdq2pd\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
-   (set_attr "mode" "DF")
-   (set_attr "athlon_decode" "double,direct")
-   (set_attr "amdfam10_decode" "vector,double")
+   (set_attr "mode" "DF,DF,V2DF")
+   (set_attr "athlon_decode" "double,direct,double")
+   (set_attr "amdfam10_decode" "vector,double,double")
    (set_attr "fp_int_src" "true")])
 
 (define_insn "*floatsi<mode>2_i387"
   [(set (match_operand:MODEF 0 "register_operand" "=f,f")
 	(float:MODEF (match_operand:SI 1 "nonimmediate_operand" "m,?r")))]
-  "TARGET_80387"
+  "TARGET_80387
+   && (!TARGET_SSE_MATH || !SSE_FLOAT_MODE_P (GET_MODE (operands[0])))"
   "@
    fild%z1\t%1
    #"
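The sse.md hunk below exists only to support the DF to SF splitter: a define_insn whose name starts with '*' gets no gen_* generator function, so dropping the star is what makes the emit_insn (gen_vec_dupv2df (...)) call above possible. Semantically the pattern just broadcasts the low double; a hedged intrinsics equivalent (helper name is illustrative):

    /* Illustrative only: what vec_dupv2df computes, written with SSE2
       intrinsics.  Both lanes of the result hold the low double of x,
       matching unpcklpd x,x (or movddup with SSE3).  */
    #include <emmintrin.h>

    __m128d
    dup_low_double (__m128d x)
    {
      return _mm_unpacklo_pd (x, x);
    }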
"athlon_decode" "double,direct,double") + (set_attr "amdfam10_decode" "vector,double,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsi2_i387" [(set (match_operand:MODEF 0 "register_operand" "=f,f") (float:MODEF (match_operand:SI 1 "nonimmediate_operand" "m,?r")))] - "TARGET_80387" + "TARGET_80387 + && (!TARGET_SSE_MATH || !SSE_FLOAT_MODE_P (GET_MODE (operands[0])))" "@ fild%z1\t%1 #" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index cb63ab9..03b2577 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2740,7 +2740,7 @@ [(set_attr "type" "sselog1") (set_attr "mode" "DF")]) -(define_insn "*vec_dupv2df" +(define_insn "vec_dupv2df" [(set (match_operand:V2DF 0 "register_operand" "=x") (vec_duplicate:V2DF (match_operand:DF 1 "register_operand" "0")))] -- cgit v1.1