author     Tamar Christina <tamar.christina@arm.com>   2021-12-02 14:39:22 +0000
committer  Tamar Christina <tamar.christina@arm.com>   2021-12-02 14:39:43 +0000
commit     9b8830b6f3920b3ec6b9013230c687dc250bb6e9
tree       1e5af8440fa2c7ff97be56d2b10d7304084f38dc
parent     d47393d0b4d0d498795c4ae1353e6c156c1c4500
AArch64: Optimize right shift rounding narrowing
This optimizes right shift rounding narrow instructions into rounding add
narrow high instructions where one vector is zero, when the shift amount is
half the element width of the original input type.
i.e.
uint32x4_t foo (uint64x2_t a, uint64x2_t b)
{
return vrshrn_high_n_u64 (vrshrn_n_u64 (a, 32), b, 32);
}
now generates:
foo:
movi v3.4s, 0
raddhn v0.2s, v2.2d, v3.2d
raddhn2 v0.4s, v2.2d, v3.2d
instead of:
foo:
rshrn v0.2s, v0.2d, 32
rshrn2 v0.4s, v1.2d, 32
ret
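
The two sequences compute the same value because, with a shift of half the
input element width, the rounding constant and shift of RSHRN match what
RADDHN performs against a zero addend.  A minimal scalar sketch of this
identity (illustrative only, not part of the patch; it models one 64-bit lane
narrowed to 32 bits):

#include <assert.h>
#include <stdint.h>

/* Rounding shift right narrow by 32: add the rounding constant 2^31,
   shift right by 32 and truncate to the narrow type.  */
static uint32_t
rshrn_32 (uint64_t a)
{
  return (uint32_t) ((a + (1ULL << 31)) >> 32);
}

/* Rounding add narrow high against a zero addend: add zero, round, and
   return the high 32 bits, which reduces to the same computation.  */
static uint32_t
raddhn_zero (uint64_t a)
{
  return (uint32_t) ((a + 0 + (1ULL << 31)) >> 32);
}

int
main (void)
{
  uint64_t tests[] = { 0, 1, 0x7fffffffULL, 0x80000000ULL,
		       0x123456789abcdef0ULL, UINT64_MAX };
  for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++)
    assert (rshrn_32 (tests[i]) == raddhn_zero (tests[i]));
  return 0;
}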
On Arm cores this is an improvement in both latency and throughput.
Because a vector zero is needed, I created a new helper,
aarch64_gen_shareable_zero, that creates the zero in V4SImode and then takes
a subreg of it in the desired mode.  This allows CSE to share all the zero
constants.
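
For illustration, a source-level sketch adapted from the new shrn-1.c test
further down (the function name here is made up for the example): both
narrowing widths are rewritten to raddhn/raddhn2 against the shared zero, so
only a single movi is expected in the generated code.

#include <arm_neon.h>

uint8x16_t
narrow_twice (uint32x4_t a, uint32x4_t b)
{
  /* 32 -> 16 bit narrows become raddhn/raddhn2 with a zero vector.  */
  uint16x4_t lo = vrshrn_n_u32 (a, 16);
  uint16x8_t wide = vrshrn_high_n_u32 (lo, b, 16);
  /* 16 -> 8 bit narrows reuse the same V4SI zero via a subreg.  */
  return vrshrn_high_n_u16 (vrshrn_n_u16 (wide, 8), wide, 8);
}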
gcc/ChangeLog:
* config/aarch64/aarch64-protos.h (aarch64_gen_shareable_zero): New.
* config/aarch64/aarch64-simd.md (aarch64_rshrn<mode>,
aarch64_rshrn2<mode>): Generate a rounding add narrow high when appropriate.
* config/aarch64/aarch64.c (aarch64_gen_shareable_zero): New.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/advsimd-intrinsics/shrn-1.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/shrn-2.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/shrn-3.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/shrn-4.c: New test.
 gcc/config/aarch64/aarch64-protos.h                          |  1
 gcc/config/aarch64/aarch64-simd.md                           | 65
 gcc/config/aarch64/aarch64.c                                 | 12
 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c | 15
 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c | 11
 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c | 11
 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c | 11
 7 files changed, 106 insertions(+), 20 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f7887d0..f7f5cae 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -846,6 +846,7 @@ const char *aarch64_output_move_struct (rtx *operands);
 rtx aarch64_return_addr_rtx (void);
 rtx aarch64_return_addr (int, rtx);
 rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT);
+rtx aarch64_gen_shareable_zero (machine_mode);
 bool aarch64_simd_mem_operand_p (rtx);
 bool aarch64_sve_ld1r_operand_p (rtx);
 bool aarch64_sve_ld1rq_operand_p (rtx);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 8e61dd9..175a9f0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1956,20 +1956,32 @@
    (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")]
   "TARGET_SIMD"
   {
-    operands[2] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
-                                                     INTVAL (operands[2]));
-    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
-    if (BYTES_BIG_ENDIAN)
-      emit_insn (gen_aarch64_rshrn<mode>_insn_be (tmp, operands[1],
-                                operands[2], CONST0_RTX (<VNARROWQ>mode)));
+    if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode))
+      {
+	rtx tmp0 = aarch64_gen_shareable_zero (<MODE>mode);
+	emit_insn (gen_aarch64_raddhn<mode> (operands[0], operands[1], tmp0));
+      }
     else
-      emit_insn (gen_aarch64_rshrn<mode>_insn_le (tmp, operands[1],
-                                operands[2], CONST0_RTX (<VNARROWQ>mode)));
-
-    /* The intrinsic expects a narrow result, so emit a subreg that will get
-       optimized away as appropriate.  */
-    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
-                                                 <VNARROWQ2>mode));
+      {
+	rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
+	operands[2] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+							 INTVAL (operands[2]));
+	if (BYTES_BIG_ENDIAN)
+	  emit_insn (
+		gen_aarch64_rshrn<mode>_insn_be (tmp, operands[1],
+						 operands[2],
+						 CONST0_RTX (<VNARROWQ>mode)));
+	else
+	  emit_insn (
+		gen_aarch64_rshrn<mode>_insn_le (tmp, operands[1],
+						 operands[2],
+						 CONST0_RTX (<VNARROWQ>mode)));
+
+	/* The intrinsic expects a narrow result, so emit a subreg that will
+	   get optimized away as appropriate.  */
+	emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
+						     <VNARROWQ2>mode));
+      }
     DONE;
   }
 )
@@ -2049,14 +2061,27 @@
    (match_operand:SI 3 "aarch64_simd_shift_imm_offset_<vn_mode>")]
   "TARGET_SIMD"
   {
-    operands[3] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
-                                                     INTVAL (operands[3]));
-    if (BYTES_BIG_ENDIAN)
-      emit_insn (gen_aarch64_rshrn2<mode>_insn_be (operands[0], operands[1],
-                                                   operands[2], operands[3]));
+    if (INTVAL (operands[3]) == GET_MODE_UNIT_BITSIZE (<VNARROWQ2>mode))
+      {
+	rtx tmp = aarch64_gen_shareable_zero (<MODE>mode);
+	emit_insn (gen_aarch64_raddhn2<mode> (operands[0], operands[1],
+					      operands[2], tmp));
+      }
     else
-      emit_insn (gen_aarch64_rshrn2<mode>_insn_le (operands[0], operands[1],
-                                                   operands[2], operands[3]));
+      {
+	operands[3] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+							 INTVAL (operands[3]));
+	if (BYTES_BIG_ENDIAN)
+	  emit_insn (gen_aarch64_rshrn2<mode>_insn_be (operands[0],
+						       operands[1],
+						       operands[2],
+						       operands[3]));
+	else
+	  emit_insn (gen_aarch64_rshrn2<mode>_insn_le (operands[0],
+						       operands[1],
+						       operands[2],
+						       operands[3]));
+      }
     DONE;
   }
 )
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 7389b59..be24b73 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -20414,6 +20414,18 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
 	 == SYMBOL_TINY_ABSOLUTE;
 }
 
+/* Create a 0 constant that is based on V4SI to allow CSE to optimally share
+   the constant creation.  */
+
+rtx
+aarch64_gen_shareable_zero (machine_mode mode)
+{
+  machine_mode zmode = V4SImode;
+  rtx tmp = gen_reg_rtx (zmode);
+  emit_move_insn (tmp, CONST0_RTX (zmode));
+  return lowpart_subreg (mode, tmp, zmode);
+}
+
 /* Return a const_int vector of VAL.  */
 
 rtx
 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c
new file mode 100644
index 0000000..4bc3aa9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
+
+#include <arm_neon.h>
+
+uint8x16_t foo (uint32x4_t a, uint32x4_t b)
+{
+  uint16x4_t a1 = vrshrn_n_u32 (a, 16);
+  uint16x8_t b1 = vrshrn_high_n_u32 (a1, b, 16);
+  return vrshrn_high_n_u16 (vrshrn_n_u16 (b1, 8), b1, 8);
+}
+
+/* { dg-final { scan-assembler-times {\tmovi\t} 1 } } */
+/* { dg-final { scan-assembler-times {\traddhn\t} 2 } } */
+/* { dg-final { scan-assembler-times {\traddhn2\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c
new file mode 100644
index 0000000..09d913e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+
+#include <arm_neon.h>
+
+uint32x4_t foo (uint64x2_t a, uint64x2_t b)
+{
+  return vrshrn_high_n_u64 (vrshrn_n_u64 (a, 32), b, 32);
+}
+
+/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
+/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c
new file mode 100644
index 0000000..bdccbb3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+
+#include <arm_neon.h>
+
+uint16x8_t foo (uint32x4_t a, uint32x4_t b)
+{
+  return vrshrn_high_n_u32 (vrshrn_n_u32 (a, 16), b, 16);
+}
+
+/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
+/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c
new file mode 100644
index 0000000..4b23edd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+
+#include <arm_neon.h>
+
+uint8x16_t foo (uint16x8_t a, uint16x8_t b)
+{
+  return vrshrn_high_n_u16 (vrshrn_n_u16 (a, 8), b, 8);
+}
+
+/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
+/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */