aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorVictor Do Nascimento <victor.donascimento@arm.com>2023-04-25 10:57:00 +0100
committerVictor Do Nascimento <victor.donascimento@arm.com>2023-04-25 11:44:42 +0100
commit85279b0bddc1c5a7d181e2168e26ded354b21f32 (patch)
treebe7309c8e1843a73c069ef2f450c0512142ade0d /gcc
parenta024ac7bca9b9de1d2e0c19d4bb11df293e27a7d (diff)
downloadgcc-85279b0bddc1c5a7d181e2168e26ded354b21f32.zip
gcc-85279b0bddc1c5a7d181e2168e26ded354b21f32.tar.gz
gcc-85279b0bddc1c5a7d181e2168e26ded354b21f32.tar.bz2
aarch64: Leveraging the use of STP instruction for vec_duplicate
The backend pattern for storing a pair of identical values in 32 and 64-bit
modes with the machine instruction STP was missing, and multiple instructions
were needed to reproduce this behavior as a result of a failed RTL pattern
match in the combine pass.

For the test case:

    typedef long long v2di __attribute__((vector_size (16)));
    typedef int v2si __attribute__((vector_size (8)));

    void
    foo (v2di *x, long long a)
    {
      v2di tmp = {a, a};
      *x = tmp;
    }

    void
    foo2 (v2si *x, int a)
    {
      v2si tmp = {a, a};
      *x = tmp;
    }

at -O2 on aarch64 gives:

    foo:
        stp x1, x1, [x0]
        ret
    foo2:
        stp w1, w1, [x0]
        ret

instead of:

    foo:
        dup v0.2d, x1
        str q0, [x0]
        ret
    foo2:
        dup v0.2s, w1
        str d0, [x0]
        ret

Bootstrapped and regtested on aarch64-none-linux-gnu.

gcc/
    * config/aarch64/aarch64-simd.md (aarch64_simd_stp<mode>): New.
    * config/aarch64/constraints.md: Make "Umn" relaxed memory constraint.
    * config/aarch64/iterators.md (ldpstp_vel_sz): New.

gcc/testsuite/
    * gcc.target/aarch64/stp_vec_dup_32_64-1.c: New.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/aarch64/aarch64-simd.md10
-rw-r--r--gcc/config/aarch64/constraints.md2
-rw-r--r--gcc/config/aarch64/iterators.md3
-rw-r--r--gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c57
4 files changed, 71 insertions, 1 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9f2fce6..cfad812 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -257,6 +257,16 @@
[(set_attr "type" "neon_stp")]
)
+(define_insn "aarch64_simd_stp<mode>" ;; Store operand 1 twice to memory operand 0 with one STP.
+ [(set (match_operand:VP_2E 0 "aarch64_mem_pair_lanes_operand" "=Umn,Umn")
+ (vec_duplicate:VP_2E (match_operand:<VEL> 1 "register_operand" "w,r")))] ;; Two alternatives for operand 1: constraint "w" or "r".
+ "TARGET_SIMD"
+ "@
+ stp\\t%<Vetype>1, %<Vetype>1, %y0
+ stp\\t%<vw>1, %<vw>1, %y0"
+ [(set_attr "type" "neon_stp, store_<ldpstp_vel_sz>")] ;; One "type" value per alternative, in order.
+)
+
(define_insn "load_pair<VQ:mode><VQ2:mode>"
[(set (match_operand:VQ 0 "register_operand" "=w")
(match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 5b20abc..6df1dbe 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -287,7 +287,7 @@
;; Used for storing or loading pairs in an AdvSIMD register using an STP/LDP
;; as a vector-concat. The address mode uses the same constraints as if it
;; were for a single value.
-(define_memory_constraint "Umn"
+(define_relaxed_memory_constraint "Umn"
"@internal
A memory address suitable for a load/store pair operation."
(and (match_code "mem")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 13a7e89..1d0b482 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1020,6 +1020,9 @@
;; Likewise for load/store pair.
(define_mode_attr ldpstp_sz [(SI "8") (DI "16")])
+;; Access size in bytes (the whole vector) for STP/LDP of two-element vector modes.
+(define_mode_attr ldpstp_vel_sz [(V2SI "8") (V2SF "8") (V2DI "16") (V2DF "16")])
+
;; For inequal width int to float conversion
(define_mode_attr w1 [(HF "w") (SF "w") (DF "x")])
(define_mode_attr w2 [(HF "x") (SF "x") (DF "w")])
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c
new file mode 100644
index 0000000..fc2c1ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef int v2si __attribute__((vector_size (8)));
+
+#define TESTV2DI(lab, idx) /* Define stpv2di_<lab>: store {a, a} to x[idx].  */ \
+ void \
+ stpv2di_##lab (v2di *x, long long a) \
+ { \
+ v2di tmp = {a, a}; /* Both lanes hold A.  */ \
+ x[idx] = tmp; /* Byte offset is IDX * sizeof (v2di), i.e. IDX * 16.  */ \
+ }
+
+
+#define TESTV2SI(lab, idx) /* Define stpv2si_<lab>: store {a, a} to x[idx].  */ \
+ void \
+ stpv2si_##lab (v2si *x, int a) \
+ { \
+ v2si tmp = {a, a}; /* Both lanes hold A.  */ \
+ x[idx] = tmp; /* Byte offset is IDX * sizeof (v2si), i.e. IDX * 8.  */ \
+ } /* NOTE(review): the trailing continuation splices the next (blank) line into the macro.  */ \
+
+/* Core test, no imm assembler offset: */
+
+TESTV2SI(0, 0)
+TESTV2DI(0, 0)
+/* { dg-final { scan-assembler {\s+stp\t(w[0-9]+), \1, \[x[0-9]+\]} } } */
+/* { dg-final { scan-assembler {\s+stp\t(x[0-9]+), \1, \[x[0-9]+\]} } } */
+
+/* Lower offset bounds: */
+
+/* Valid offsets: */
+TESTV2SI(1, -32)
+TESTV2DI(1, -32)
+/* { dg-final { scan-assembler {\s+stp\t(w[0-9]+), \1, \[x[0-9]+, -256\]} } } */
+/* { dg-final { scan-assembler {\s+stp\t(x[0-9]+), \1, \[x[0-9]+, -512\]} } } */
+/* Invalid offsets: */
+TESTV2SI(2, -33)
+TESTV2DI(2, -33)
+/* { dg-final { scan-assembler-not {\s+stp\t(w[0-9]+), \1, \[x[0-9]+, -264\]} } } */
+/* { dg-final { scan-assembler-not {\s+stp\t(x[0-9]+), \1, \[x[0-9]+, -528\]} } } */
+
+/* Upper offset bounds: */
+
+/* Valid offsets: */
+TESTV2SI(3, 31)
+TESTV2DI(3, 31)
+/* { dg-final { scan-assembler {\s+stp\t(w[0-9]+), \1, \[x[0-9]+, 248\]} } } */
+/* { dg-final { scan-assembler {\s+stp\t(x[0-9]+), \1, \[x[0-9]+, 496\]} } } */
+/* Invalid offsets: */
+TESTV2SI(4, 32)
+TESTV2DI(4, 32)
+/* { dg-final { scan-assembler-not {\s+stp\t(w[0-9]+), \1, \[x[0-9]+, 256\]} } } */
+/* { dg-final { scan-assembler-not {\s+stp\t(x[0-9]+), \1, \[x[0-9]+, 512\]} } } */
+
+