author	Kyrylo Tkachov <ktkachov@nvidia.com>	2024-08-05 11:29:44 -0700
committer	Kyrylo Tkachov <ktkachov@nvidia.com>	2024-08-12 11:41:04 +0200
commit	fcc766c82cf8e0473ba54f1660c8282a7ce3231c (patch)
tree	47efffe04c8e7d64e763367d485c190e4956c95f /gcc
parent	8d8db21eb726b785782f4a41ad85a0d4be63068a (diff)
aarch64: Emit ADD X, Y, Y instead of SHL X, Y, #1 for Advanced SIMD
On many cores, including Neoverse V2, the throughput of vector ADD
instructions is higher than that of vector shifts like SHL.  We can lean on
that to emit code like:
  add	v0.4s, v0.4s, v0.4s
instead of:
  shl	v0.4s, v0.4s, 1

LLVM already does this trick.
In RTL the code gets canonicalised from (plus x x) to (ashift x 1), so I
opted to do this at the final assembly printing stage instead, similar to
how we emit CMLT instead of SSHR elsewhere in the backend.

I'd like to also do this for SVE shifts, but those will have to be
separate patches.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md
	(aarch64_simd_imm_shl<mode><vczle><vczbe>): Rewrite to new
	syntax.  Add =w,w,vs1 alternative.
	* config/aarch64/constraints.md (vs1): New constraint.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/advsimd_shl_add.c: New test.
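As an illustration of the transformation, a minimal standalone sketch mirroring
the new test below (the function name double_v4si and the -O2 flag are chosen
for the example only, not taken from the patch):

  typedef int __attribute__ ((vector_size (16))) v4si;

  /* A vector left shift by one.  RTL canonicalises (plus x x) into
     (ashift x 1), so this is the form the new insn alternative catches.  */
  v4si
  double_v4si (v4si a)
  {
    return a << 1;
  }

  /* With this patch applied, an aarch64 compiler at -O2 is expected to print
       add v0.4s, v0.4s, v0.4s
     for the shift, where it previously emitted
       shl v0.4s, v0.4s, 1  */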
Diffstat (limited to 'gcc')
-rw-r--r--	gcc/config/aarch64/aarch64-simd.md	12
-rw-r--r--	gcc/config/aarch64/constraints.md	6
-rw-r--r--	gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c	64
3 files changed, 77 insertions(+), 5 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cc612ec..475f197 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1352,12 +1352,14 @@
)
(define_insn "aarch64_simd_imm_shl<mode><vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
- (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
- (match_operand:VDQ_I 2 "aarch64_simd_lshift_imm" "Dl")))]
+ [(set (match_operand:VDQ_I 0 "register_operand")
+ (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand")
+ (match_operand:VDQ_I 2 "aarch64_simd_lshift_imm")))]
"TARGET_SIMD"
- "shl\t%0.<Vtype>, %1.<Vtype>, %2"
- [(set_attr "type" "neon_shift_imm<q>")]
+ {@ [ cons: =0, 1, 2 ; attrs: type ]
+ [ w , w, vs1 ; neon_add<q> ] add\t%0.<Vtype>, %1.<Vtype>, %1.<Vtype>
+ [ w , w, Dl ; neon_shift_imm<q> ] shl\t%0.<Vtype>, %1.<Vtype>, %2
+ }
)
(define_insn "aarch64_simd_reg_sshl<mode><vczle><vczbe>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index a2878f5..f491e4b 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -667,6 +667,12 @@
SMAX and SMIN operations."
(match_operand 0 "aarch64_sve_vsm_immediate"))
+(define_constraint "vs1"
+ "@internal
+ A constraint that matches a vector of immediate one."
+ (and (match_code "const,const_vector")
+ (match_test "op == CONST1_RTX (GET_MODE (op))")))
+
(define_constraint "vsA"
"@internal
A constraint that matches an immediate operand valid for SVE FADD
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c b/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c
new file mode 100644
index 0000000..a161f89
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-additional-options "--save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef __INT64_TYPE__ __attribute__ ((vector_size (16))) v2di;
+typedef int __attribute__ ((vector_size (16))) v4si;
+typedef short __attribute__ ((vector_size (16))) v8hi;
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef short __attribute__ ((vector_size (8))) v4hi;
+typedef char __attribute__ ((vector_size (8))) v8qi;
+
+#define FUNC(S) \
+S \
+foo_##S (S a) \
+{ return a << 1; }
+
+/*
+** foo_v2di:
+** add v0.2d, v0.2d, v0.2d
+** ret
+*/
+
+FUNC (v2di)
+
+/*
+** foo_v4si:
+** add v0.4s, v0.4s, v0.4s
+** ret
+*/
+
+FUNC (v4si)
+
+/*
+** foo_v8hi:
+** add v0.8h, v0.8h, v0.8h
+** ret
+*/
+
+FUNC (v8hi)
+
+/*
+** foo_v16qi:
+** add v0.16b, v0.16b, v0.16b
+** ret
+*/
+
+FUNC (v16qi)
+
+/*
+** foo_v4hi:
+** add v0.4h, v0.4h, v0.4h
+** ret
+*/
+
+FUNC (v4hi)
+
+/*
+** foo_v8qi:
+** add v0.8b, v0.8b, v0.8b
+** ret
+*/
+
+FUNC (v8qi)
+