diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2021-07-19 10:19:30 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2021-08-04 16:58:26 +0100 |
commit | 63834c84d43fc2eeeaa054c5e24d1e468e9eddab (patch) | |
tree | ea722026fde334209daef1e4740b81cfb4b17ebd | |
parent | 1d65c9d25199264bc8909018df1b0dca71c0b32d (diff) | |
download | gcc-63834c84d43fc2eeeaa054c5e24d1e468e9eddab.zip gcc-63834c84d43fc2eeeaa054c5e24d1e468e9eddab.tar.gz gcc-63834c84d43fc2eeeaa054c5e24d1e468e9eddab.tar.bz2 |
aarch64: Don't include vec_select high-half in SIMD multiply cost
The Neon multiply/multiply-accumulate/multiply-subtract instructions
can select the top or bottom half of the operand registers. This
selection does not change the cost of the underlying instruction and
this should be reflected by the RTL cost function.
This patch adds RTL tree traversal in the Neon multiply cost function
to match vec_select high-half of its operands. This traversal
prevents the cost of the vec_select from being added into the cost of
the multiply - meaning that these instructions can now be emitted in
the combine pass as they are no longer deemed prohibitively
expensive.
gcc/ChangeLog:
2021-07-19 Jonathan Wright <jonathan.wright@arm.com>
* config/aarch64/aarch64.c (aarch64_strip_extend_vec_half):
Define.
(aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of
vec_select high-half from being added into Neon multiply
cost.
* rtlanal.c (vec_series_highpart_p): Define.
* rtlanal.h (vec_series_highpart_p): Declare.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/vmul_high_cost.c: New test.
-rw-r--r-- | gcc/config/aarch64/aarch64.c | 22 | ||||
-rw-r--r-- | gcc/rtlanal.c | 19 | ||||
-rw-r--r-- | gcc/rtlanal.h | 4 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c | 85 |
4 files changed, 130 insertions, 0 deletions
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 23829bb..e02cbcb 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -78,6 +78,7 @@ #include "gimple-pretty-print.h" #include "tree-ssa-loop-niter.h" #include "fractional-cost.h" +#include "rtlanal.h" /* This file should be included last. */ #include "target-def.h" @@ -12046,6 +12047,22 @@ aarch64_strip_extend (rtx x, bool strip_shift) return x; } +/* Helper function for rtx cost calculation. Strip extension as well as any + inner VEC_SELECT high-half from X. Returns the inner vector operand if + successful, or the original expression on failure. */ +static rtx +aarch64_strip_extend_vec_half (rtx x) +{ + if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND) + { + x = XEXP (x, 0); + if (GET_CODE (x) == VEC_SELECT + && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)), + XEXP (x, 1))) + x = XEXP (x, 0); + } + return x; +} /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as any subsequent extend and VEC_SELECT from X. Returns the inner scalar @@ -12133,6 +12150,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) unsigned int vec_flags = aarch64_classify_vector_mode (mode); if (vec_flags & VEC_ADVSIMD) { + /* The select-operand-high-half versions of the instruction have the + same cost as the three vector version - don't add the costs of the + extension or selection into the costs of the multiply. */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); /* The by-element versions of the instruction have the same costs as the normal 3-vector version. We make an assumption that the input to the VEC_DUPLICATE is already on the FP & SIMD side. This means diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c index f7f3acb..d37f778 100644 --- a/gcc/rtlanal.c +++ b/gcc/rtlanal.c @@ -6957,6 +6957,25 @@ register_asm_p (const_rtx x) (vec_select:RESULT_MODE OP SEL) + is equivalent to the highpart RESULT_MODE of OP. */ + +bool +vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel) +{ + int nunits; + if (GET_MODE_NUNITS (op_mode).is_constant (&nunits) + && targetm.can_change_mode_class (op_mode, result_mode, ALL_REGS)) + { + int offset = BYTES_BIG_ENDIAN ? 0 : nunits - XVECLEN (sel, 0); + return rtvec_series_p (XVEC (sel, 0), offset); + } + return false; +} + +/* Return true if, for all OP of mode OP_MODE: + + (vec_select:RESULT_MODE OP SEL) + is equivalent to the lowpart RESULT_MODE of OP. */ bool diff --git a/gcc/rtlanal.h b/gcc/rtlanal.h index e164242..542dc78 100644 --- a/gcc/rtlanal.h +++ b/gcc/rtlanal.h @@ -332,6 +332,10 @@ inline vec_rtx_properties_base::~vec_rtx_properties_base () using vec_rtx_properties = growing_rtx_properties<vec_rtx_properties_base>; bool +vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, + rtx sel); + +bool vec_series_lowpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c new file mode 100644 index 0000000..ecc02e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c @@ -0,0 +1,85 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include <arm_neon.h> + +#define TEST_MULL_VEC(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + rettype t1 = name ## _ ## ts (vget_high_ ## ts (b), \ + vget_high_ ## ts (c)); \ + return vqaddq ## _ ## rs (t0, t1); \ + } + +TEST_MULL_VEC (vmull, int16x8_t, int8x16_t, s8, s16) +TEST_MULL_VEC (vmull, uint16x8_t, uint8x16_t, u8, u16) +TEST_MULL_VEC (vmull, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_VEC (vmull, uint32x4_t, uint16x8_t, u16, u32) +TEST_MULL_VEC (vmull, int64x2_t, int32x4_t, s32, s64) +TEST_MULL_VEC (vmull, uint64x2_t, uint32x4_t, u32, u64) + +TEST_MULL_VEC (vqdmull, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_VEC (vqdmull, int64x2_t, int32x4_t, s32, s64) + +#define TEST_MULL_N(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), b[1]); \ + rettype t1 = name ## _ ## ts (vget_high_ ## ts (a), c[1]); \ + return vqaddq ## _ ## rs (t0, t1); \ + } + +TEST_MULL_N (vmull_n, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_N (vmull_n, uint32x4_t, uint16x8_t, u16, u32) +TEST_MULL_N (vmull_n, int64x2_t, int32x4_t, s32, s64) +TEST_MULL_N (vmull_n, uint64x2_t, uint32x4_t, u32, u64) + +TEST_MULL_N (vqdmull_n, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_N (vqdmull_n, int64x2_t, int32x4_t, s32, s64) + +#define TEST_MLXL_VEC(name, rettype, intype, ts) \ + rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b, \ + intype c) \ + { \ + acc = name ## _ ## ts (acc, vget_high_ ## ts (a), \ + vget_high_ ## ts (b)); \ + return name ## _ ## ts (acc, vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + } + +TEST_MLXL_VEC (vmlal, int16x8_t, int8x16_t, s8) +TEST_MLXL_VEC (vmlal, uint16x8_t, uint8x16_t, u8) +TEST_MLXL_VEC (vmlal, int32x4_t, int16x8_t, s16) +TEST_MLXL_VEC (vmlal, uint32x4_t, uint16x8_t, u16) + +TEST_MLXL_VEC (vmlsl, int16x8_t, int8x16_t, s8) +TEST_MLXL_VEC (vmlsl, uint16x8_t, uint8x16_t, u8) +TEST_MLXL_VEC (vmlsl, int32x4_t, int16x8_t, s16) +TEST_MLXL_VEC (vmlsl, uint32x4_t, uint16x8_t, u16) + +#define TEST_MLXL_N(name, rettype, intype, ts) \ + rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \ + { \ + acc = name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \ + return name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \ + } + +TEST_MLXL_N (vmlal_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vmlal_n, uint32x4_t, uint16x8_t, u16) +TEST_MLXL_N (vmlal_n, int64x2_t, int32x4_t, s32) +TEST_MLXL_N (vmlal_n, uint64x2_t, uint32x4_t, u32) + +TEST_MLXL_N (vmlsl_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vmlsl_n, uint32x4_t, uint16x8_t, u16) +TEST_MLXL_N (vmlsl_n, int64x2_t, int32x4_t, s32) +TEST_MLXL_N (vmlsl_n, uint64x2_t, uint32x4_t, u32) + +TEST_MLXL_N (vqdmlal_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vqdmlal_n, int64x2_t, int32x4_t, s32) + +TEST_MLXL_N (vqdmlsl_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vqdmlsl_n, int64x2_t, int32x4_t, s32) + +/* { dg-final { scan-assembler-not "dup\\t" } } */ |