diff options
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/i386.cc | 95 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr89618-2.c | 8 |
2 files changed, 92 insertions, 11 deletions
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index d15f91d..aef4145 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -25300,7 +25300,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, else if (X87_FLOAT_MODE_P (mode)) stmt_cost = ix86_cost->fadd; else - stmt_cost = ix86_cost->add; + stmt_cost = ix86_cost->add; } else stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss @@ -25355,7 +25355,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, (subcode == RSHIFT_EXPR && !TYPE_UNSIGNED (TREE_TYPE (op1))) ? ASHIFTRT : LSHIFTRT, mode, - TREE_CODE (op2) == INTEGER_CST, + TREE_CODE (op2) == INTEGER_CST, cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1, false, false, NULL, NULL); @@ -25364,7 +25364,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, case NOP_EXPR: /* Only sign-conversions are free. */ if (tree_nop_conversion_p - (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)), + (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)), TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)))) stmt_cost = 0; else if (fp) @@ -25372,17 +25372,94 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, (ix86_tune_cost, GET_MODE_BITSIZE (mode)); break; - case BIT_IOR_EXPR: - case ABS_EXPR: - case ABSU_EXPR: + case COND_EXPR: + { + /* SSE2 conditinal move sequence is: + pcmpgtd %xmm5, %xmm0 + pand %xmm0, %xmm2 + pandn %xmm1, %xmm0 + por %xmm2, %xmm0 + while SSE4 uses cmp + blend + and AVX512 masked moves. */ + + int ninsns = TARGET_SSE4_1 ? 2 : 4; + + if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + stmt_cost = ninsns * ix86_cost->sse_op; + else if (X87_FLOAT_MODE_P (mode)) + /* x87 requires conditional branch. We don't have cost for + that. */ + ; + else if (VECTOR_MODE_P (mode)) + stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op); + else + /* compare + cmov. */ + stmt_cost = ix86_cost->add * 2; + } + break; + case MIN_EXPR: case MAX_EXPR: + if (fp) + { + if (X87_FLOAT_MODE_P (mode)) + /* x87 requires conditional branch. We don't have cost for + that. */ + ; + else + /* minss */ + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + } + else + { + if (VECTOR_MODE_P (mode)) + { + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + /* vpmin was introduced in SSE3. + SSE2 needs pcmpgtd + pand + pandn + pxor. */ + if (!TARGET_SSSE3) + stmt_cost *= 4; + } + else + /* cmp + cmov. */ + stmt_cost = ix86_cost->add * 2; + } + break; + + case ABS_EXPR: + case ABSU_EXPR: + if (fp) + { + if (X87_FLOAT_MODE_P (mode)) + /* fabs. */ + stmt_cost = ix86_cost->fabs; + else + /* andss of sign bit. */ + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + } + else + { + if (VECTOR_MODE_P (mode)) + { + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + /* vabs was introduced in SSE3. + SSE3 uses psrat + pxor + psub. */ + if (!TARGET_SSSE3) + stmt_cost *= 3; + } + else + /* neg + cmov. */ + stmt_cost = ix86_cost->add * 2; + } + break; + + case BIT_IOR_EXPR: case BIT_XOR_EXPR: case BIT_AND_EXPR: case BIT_NOT_EXPR: - if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) - stmt_cost = ix86_cost->sse_op; - else if (VECTOR_MODE_P (mode)) + gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode) + && !X87_FLOAT_MODE_P (mode)); + if (VECTOR_MODE_P (mode)) stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); else stmt_cost = ix86_cost->add; diff --git a/gcc/testsuite/gcc.target/i386/pr89618-2.c b/gcc/testsuite/gcc.target/i386/pr89618-2.c index c414053..11d658f 100644 --- a/gcc/testsuite/gcc.target/i386/pr89618-2.c +++ b/gcc/testsuite/gcc.target/i386/pr89618-2.c @@ -19,5 +19,9 @@ void foo (int n, int *off, double *a) } /* Make sure the cost model selects SSE vectors rather than AVX to avoid - too many scalar ops for the address computes in the loop. */ -/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" { target { ! ia32 } } } } */ + too many scalar ops for the address computes in the loop. + + Since open-coded scatters are costed wrong, we no longer vectorize after fixing + COND_EXPR costs. See PR119902. */ +/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" { target { ! ia32 } xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-not "loop vectorized using 32 byte vectors" "vect" { target { ! ia32 } } } } */ |