author     Tamar Christina <tamar.christina@arm.com>  2021-12-02 14:39:22 +0000
committer  Tamar Christina <tamar.christina@arm.com>  2021-12-02 14:39:43 +0000
commit     9b8830b6f3920b3ec6b9013230c687dc250bb6e9 (patch)
tree       1e5af8440fa2c7ff97be56d2b10d7304084f38dc
parent     d47393d0b4d0d498795c4ae1353e6c156c1c4500 (diff)
AArch64: Optimize right shift rounding narrowing
This optimizes right shift rounding narrow instructions into a rounding
add narrow high with a zero vector when the shift amount is half the
bit-width of the original input type.

i.e.

uint32x4_t foo (uint64x2_t a, uint64x2_t b)
{
  return vrshrn_high_n_u64 (vrshrn_n_u64 (a, 32), b, 32);
}

now generates:

foo:
	movi	v3.4s, 0
	raddhn	v0.2s, v0.2d, v3.2d
	raddhn2	v0.4s, v1.2d, v3.2d
	ret

instead of:

foo:
	rshrn	v0.2s, v0.2d, 32
	rshrn2	v0.4s, v1.2d, 32
	ret

On Arm cores this is an improvement in both latency and throughput.
Because a vector zero is needed I created a new method
aarch64_gen_shareable_zero that creates zeros using V4SI and then
takes a subreg of the zero to the desired type.  This allows CSE to
share all the zero constants.

gcc/ChangeLog:

	* config/aarch64/aarch64-protos.h (aarch64_gen_shareable_zero): New.
	* config/aarch64/aarch64-simd.md (aarch64_rshrn<mode>,
	aarch64_rshrn2<mode>): Generate rounding half-ing add when appropriate.
	* config/aarch64/aarch64.c (aarch64_gen_shareable_zero): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/advsimd-intrinsics/shrn-1.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/shrn-2.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/shrn-3.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/shrn-4.c: New test.
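Why the rewrite is safe: RSHRN by n computes (x + (1 << (n - 1))) >> n
and narrows, while RADDHN computes (a + b + (1 << (half - 1))) and keeps
the high half of the sum; with b == 0 and n equal to half the element
width the two agree lane-wise.  A minimal scalar model checking the
identity for one u64 lane (a standalone sketch, not part of the patch;
all names are illustrative):

#include <assert.h>
#include <stdint.h>

/* Model of rshrn #32 on one u64 lane: round, shift right, narrow.  */
static uint32_t
model_rshrn32 (uint64_t x)
{
  return (uint32_t) ((x + ((uint64_t) 1 << 31)) >> 32);
}

/* Model of raddhn on one u64 lane: rounding add, take the high half.  */
static uint32_t
model_raddhn32 (uint64_t a, uint64_t b)
{
  return (uint32_t) ((a + b + ((uint64_t) 1 << 31)) >> 32);
}

int
main (void)
{
  uint64_t tests[] = { 0, 1, 0x7fffffffULL, 0x80000000ULL,
		       0xdeadbeefcafef00dULL, UINT64_MAX };
  for (unsigned int i = 0; i < sizeof tests / sizeof tests[0]; i++)
    /* rshrn by half the element width == raddhn against a zero vector;
       the C wrap-around at 2^64 matches the narrowed architectural
       result, since only the low 32 bits of the shifted sum survive.  */
    assert (model_rshrn32 (tests[i]) == model_raddhn32 (tests[i], 0));
  return 0;
}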
-rw-r--r--  gcc/config/aarch64/aarch64-protos.h                             1
-rw-r--r--  gcc/config/aarch64/aarch64-simd.md                             65
-rw-r--r--  gcc/config/aarch64/aarch64.c                                   12
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c   15
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c   11
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c   11
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c   11
7 files changed, 106 insertions, 20 deletions
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f7887d0..f7f5cae 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -846,6 +846,7 @@ const char *aarch64_output_move_struct (rtx *operands);
rtx aarch64_return_addr_rtx (void);
rtx aarch64_return_addr (int, rtx);
rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT);
+rtx aarch64_gen_shareable_zero (machine_mode);
bool aarch64_simd_mem_operand_p (rtx);
bool aarch64_sve_ld1r_operand_p (rtx);
bool aarch64_sve_ld1rq_operand_p (rtx);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 8e61dd9..175a9f0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1956,20 +1956,32 @@
(match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")]
"TARGET_SIMD"
{
- operands[2] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
- INTVAL (operands[2]));
- rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
- if (BYTES_BIG_ENDIAN)
- emit_insn (gen_aarch64_rshrn<mode>_insn_be (tmp, operands[1],
- operands[2], CONST0_RTX (<VNARROWQ>mode)));
+ if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode))
+ {
+ rtx tmp0 = aarch64_gen_shareable_zero (<MODE>mode);
+ emit_insn (gen_aarch64_raddhn<mode> (operands[0], operands[1], tmp0));
+ }
else
- emit_insn (gen_aarch64_rshrn<mode>_insn_le (tmp, operands[1],
- operands[2], CONST0_RTX (<VNARROWQ>mode)));
-
- /* The intrinsic expects a narrow result, so emit a subreg that will get
- optimized away as appropriate. */
- emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
- <VNARROWQ2>mode));
+ {
+ rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
+ operands[2] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+ INTVAL (operands[2]));
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (
+ gen_aarch64_rshrn<mode>_insn_be (tmp, operands[1],
+ operands[2],
+ CONST0_RTX (<VNARROWQ>mode)));
+ else
+ emit_insn (
+ gen_aarch64_rshrn<mode>_insn_le (tmp, operands[1],
+ operands[2],
+ CONST0_RTX (<VNARROWQ>mode)));
+
+ /* The intrinsic expects a narrow result, so emit a subreg that will
+ get optimized away as appropriate. */
+ emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
+ <VNARROWQ2>mode));
+ }
DONE;
}
)
@@ -2049,14 +2061,27 @@
(match_operand:SI 3 "aarch64_simd_shift_imm_offset_<vn_mode>")]
"TARGET_SIMD"
{
- operands[3] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
- INTVAL (operands[3]));
- if (BYTES_BIG_ENDIAN)
- emit_insn (gen_aarch64_rshrn2<mode>_insn_be (operands[0], operands[1],
- operands[2], operands[3]));
+ if (INTVAL (operands[3]) == GET_MODE_UNIT_BITSIZE (<VNARROWQ2>mode))
+ {
+ rtx tmp = aarch64_gen_shareable_zero (<MODE>mode);
+ emit_insn (gen_aarch64_raddhn2<mode> (operands[0], operands[1],
+ operands[2], tmp));
+ }
else
- emit_insn (gen_aarch64_rshrn2<mode>_insn_le (operands[0], operands[1],
- operands[2], operands[3]));
+ {
+ operands[3] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+ INTVAL (operands[3]));
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (gen_aarch64_rshrn2<mode>_insn_be (operands[0],
+ operands[1],
+ operands[2],
+ operands[3]));
+ else
+ emit_insn (gen_aarch64_rshrn2<mode>_insn_le (operands[0],
+ operands[1],
+ operands[2],
+ operands[3]));
+ }
DONE;
}
)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 7389b59..be24b73 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -20414,6 +20414,18 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
== SYMBOL_TINY_ABSOLUTE;
}
+/* Create a 0 constant that is based on V4SI to allow CSE to optimally share
+ the constant creation. */
+
+rtx
+aarch64_gen_shareable_zero (machine_mode mode)
+{
+ machine_mode zmode = V4SImode;
+ rtx tmp = gen_reg_rtx (zmode);
+ emit_move_insn (tmp, CONST0_RTX (zmode));
+ return lowpart_subreg (mode, tmp, zmode);
+}
+
/* Return a const_int vector of VAL. */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
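The V4SI detour is what makes the zeros CSE-able: every call emits an
identical move of CONST0_RTX (V4SImode) into a fresh pseudo and hands
back only a subreg in the requested mode, so later passes can fold the
moves into a single movi.  A hypothetical caller sketch (illustration
only, not part of the patch):

/* Two zeros requested in different vector modes.  Each call below
   emits its own (set (reg:V4SI ...) (const_vector:V4SI [0 ...]));
   because the two source expressions are identical, CSE can rewrite
   both uses to share one zero register (see shrn-1.c below, which
   expects exactly one movi).  */
rtx zero_v2di = aarch64_gen_shareable_zero (V2DImode);
rtx zero_v8hi = aarch64_gen_shareable_zero (V8HImode);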
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c
new file mode 100644
index 0000000..4bc3aa9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
+
+#include <arm_neon.h>
+
+uint8x16_t foo (uint32x4_t a, uint32x4_t b)
+{
+ uint16x4_t a1 = vrshrn_n_u32 (a, 16);
+ uint16x8_t b1 = vrshrn_high_n_u32 (a1, b, 16);
+ return vrshrn_high_n_u16 (vrshrn_n_u16 (b1, 8), b1, 8);
+}
+
+/* { dg-final { scan-assembler-times {\tmovi\t} 1 } } */
+/* { dg-final { scan-assembler-times {\traddhn\t} 2 } } */
+/* { dg-final { scan-assembler-times {\traddhn2\t} 2 } } */
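The single-movi scan above is the interesting part of shrn-1.c: three
narrowing shifts over two element widths should still materialize only
one shared zero.  One way to eyeball this locally (the cross-compiler
name and flags are assumptions, not part of the patch):

$ aarch64-none-linux-gnu-gcc -O2 -S shrn-1.c -o - | grep -E 'movi|raddhn'

which should show one movi line, two raddhn and two raddhn2.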
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c
new file mode 100644
index 0000000..09d913e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+
+#include <arm_neon.h>
+
+uint32x4_t foo (uint64x2_t a, uint64x2_t b)
+{
+ return vrshrn_high_n_u64 (vrshrn_n_u64 (a, 32), b, 32);
+}
+
+/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
+/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c
new file mode 100644
index 0000000..bdccbb3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+
+#include <arm_neon.h>
+
+uint16x8_t foo (uint32x4_t a, uint32x4_t b)
+{
+ return vrshrn_high_n_u32 (vrshrn_n_u32 (a, 16), b, 16);
+}
+
+/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
+/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c
new file mode 100644
index 0000000..4b23edd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+
+#include <arm_neon.h>
+
+uint8x16_t foo (uint16x8_t a, uint16x8_t b)
+{
+ return vrshrn_high_n_u16 (vrshrn_n_u16 (a, 8), b, 8);
+}
+
+/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
+/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */