diff options
-rw-r--r-- | gcc/config/aarch64/aarch64.c | 18 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/asimd-mull-elem.c | 21 |
2 files changed, 38 insertions, 1 deletions
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 973c65a..f3551a7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11279,7 +11279,23 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
   op1 = XEXP (x, 1);
 
   if (VECTOR_MODE_P (mode))
-    mode = GET_MODE_INNER (mode);
+    {
+      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+      mode = GET_MODE_INNER (mode);
+      if (vec_flags & VEC_ADVSIMD)
+        {
+          /* The by-element versions of the instruction have the same costs as
+             the normal 3-vector version. So don't add the costs of the
+             duplicate into the costs of the multiply. We make an assumption
+             that the input to the VEC_DUPLICATE is already on the FP & SIMD
+             side. This means costing of a MUL by element pre RA is a bit
+             optimistic. */
+          if (GET_CODE (op0) == VEC_DUPLICATE)
+            op0 = XEXP (op0, 0);
+          else if (GET_CODE (op1) == VEC_DUPLICATE)
+            op1 = XEXP (op1, 0);
+        }
+    }
 
   /* Integer multiply/fma. */
   if (GET_MODE_CLASS (mode) == MODE_INT)
diff --git a/gcc/testsuite/gcc.target/aarch64/asimd-mull-elem.c b/gcc/testsuite/gcc.target/aarch64/asimd-mull-elem.c
new file mode 100644
index 0000000..513721c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/asimd-mull-elem.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-options "-Ofast" } */
+
+#include <arm_neon.h>
+
+void s_mult_i (int32_t* restrict res, int32_t* restrict a, int32_t b)
+{
+  for (int x = 0; x < 16; x++)
+    res[x] = a[x] * b;
+}
+
+void s_mult_f (float32_t* restrict res, float32_t* restrict a, float32_t b)
+{
+  for (int x = 0; x < 16; x++)
+    res[x] = a[x] * b;
+}
+
+/* { dg-final { scan-assembler-times {\s+mul\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.s\[0\]} 4 } } */
+/* { dg-final { scan-assembler-times {\s+fmul\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.s\[0\]} 4 } } */