aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWilco Dijkstra <wilco.dijkstra@arm.com>2024-05-15 13:07:27 +0100
committerWilco Dijkstra <wilco.dijkstra@arm.com>2024-05-15 13:26:07 +0100
commit43fb827f259e6fdea39bc4021950c810be769d58 (patch)
tree655fcad0407435ce58119fc4a9bd00073c663014
parent9b7cad5884f21cc5783075be0043777448db3fab (diff)
downloadgcc-43fb827f259e6fdea39bc4021950c810be769d58.zip
gcc-43fb827f259e6fdea39bc4021950c810be769d58.tar.gz
gcc-43fb827f259e6fdea39bc4021950c810be769d58.tar.bz2
AArch64: Use UZP1 instead of INS
Use UZP1 instead of INS when combining low and high halves of vectors.
UZP1 has 3 operands which improves register allocation, and is faster on
some microarchitectures.

gcc:
	* config/aarch64/aarch64-simd.md (aarch64_combine_internal<mode>):
	Use UZP1 instead of INS.
	(aarch64_combine_internal_be<mode>): Likewise.

gcc/testsuite:
	* gcc.target/aarch64/ldp_stp_16.c: Update to check for UZP1.
	* gcc.target/aarch64/pr109072_1.c: Likewise.
	* gcc.target/aarch64/vec-init-14.c: Likewise.
	* gcc.target/aarch64/vec-init-9.c: Likewise.
-rw-r--r--gcc/config/aarch64/aarch64-simd.md4
-rw-r--r--gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c16
-rw-r--r--gcc/testsuite/gcc.target/aarch64/pr109072_1.c4
-rw-r--r--gcc/testsuite/gcc.target/aarch64/vec-init-14.c4
-rw-r--r--gcc/testsuite/gcc.target/aarch64/vec-init-9.c12
5 files changed, 20 insertions, 20 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f8bb973..16b7445d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4388,7 +4388,7 @@
&& (register_operand (operands[0], <VDBL>mode)
|| register_operand (operands[2], <MODE>mode))"
{@ [ cons: =0 , 1 , 2 ; attrs: type , arch ]
- [ w , 0 , w ; neon_ins<dblq> , simd ] ins\t%0.<single_type>[1], %2.<single_type>[0]
+ [ w , w , w ; neon_permute<dblq> , simd ] uzp1\t%0.2<single_type>, %1.2<single_type>, %2.2<single_type>
[ w , 0 , ?r ; neon_from_gp<dblq> , simd ] ins\t%0.<single_type>[1], %<single_wx>2
[ w , 0 , ?r ; f_mcr , * ] fmov\t%0.d[1], %2
[ w , 0 , Utv ; neon_load1_one_lane<dblq> , simd ] ld1\t{%0.<single_type>}[1], %2
@@ -4407,7 +4407,7 @@
&& (register_operand (operands[0], <VDBL>mode)
|| register_operand (operands[2], <MODE>mode))"
{@ [ cons: =0 , 1 , 2 ; attrs: type , arch ]
- [ w , 0 , w ; neon_ins<dblq> , simd ] ins\t%0.<single_type>[1], %2.<single_type>[0]
+ [ w , w , w ; neon_permute<dblq> , simd ] uzp1\t%0.2<single_type>, %1.2<single_type>, %2.2<single_type>
[ w , 0 , ?r ; neon_from_gp<dblq> , simd ] ins\t%0.<single_type>[1], %<single_wx>2
[ w , 0 , ?r ; f_mcr , * ] fmov\t%0.d[1], %2
[ w , 0 , Utv ; neon_load1_one_lane<dblq> , simd ] ld1\t{%0.<single_type>}[1], %2
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
index f1f46e0..95835aa 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
@@ -80,16 +80,16 @@ CONS2_FN (2, float);
/*
** cons2_4_float: { target aarch64_little_endian }
-** ins v0.s\[1\], v1.s\[0\]
-** stp d0, d0, \[x0\]
-** stp d0, d0, \[x0, #?16\]
+** uzp1 v([0-9])\.2s, v0\.2s, v1\.2s
+** stp d\1, d\1, \[x0\]
+** stp d\1, d\1, \[x0, #?16\]
** ret
*/
/*
** cons2_4_float: { target aarch64_big_endian }
-** ins v1.s\[1\], v0.s\[0\]
-** stp d1, d1, \[x0\]
-** stp d1, d1, \[x0, #?16\]
+** uzp1 v([0-9])\.2s, v1\.2s, v0\.2s
+** stp d\1, d\1, \[x0\]
+** stp d\1, d\1, \[x0, #?16\]
** ret
*/
CONS2_FN (4, float);
@@ -125,8 +125,8 @@ CONS4_FN (2, float);
/*
** cons4_4_float:
-** ins v[0-9]+\.s[^\n]+
-** ins v[0-9]+\.s[^\n]+
+** uzp1 v[0-9]+\.2s[^\n]+
+** uzp1 v[0-9]+\.2s[^\n]+
** zip1 v([0-9]+).4s, [^\n]+
** stp q\1, q\1, \[x0\]
** stp q\1, q\1, \[x0, #?32\]
diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
index 6c1d2b0..0fc195a 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
@@ -54,7 +54,7 @@ f32x2_1 (float32_t x)
/*
** f32x2_2:
-** ins v0\.s\[1\], v1.s\[0\]
+** uzp1 v0\.2s, v0\.2s, v1\.2s
** ret
*/
float32x2_t
@@ -165,7 +165,7 @@ f64x2_1 (float64_t x)
/*
** f64x2_2:
-** ins v0\.d\[1\], v1.d\[0\]
+** uzp1 v0\.2d, v0\.2d, v1\.2d
** ret
*/
float64x2_t
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
index 0287508..1a2cc9f 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
@@ -67,7 +67,7 @@ int32x2_t s32_6(int32_t a0, int32_t a1) {
/*
** f32_1:
-** ins v0\.s\[1\], v1\.s\[0\]
+** uzp1 v0\.2s, v0\.2s, v1\.2s
** ret
*/
float32x2_t f32_1(float32_t a0, float32_t a1) {
@@ -90,7 +90,7 @@ float32x2_t f32_2(float32_t a0, float32_t *ptr) {
/*
** f32_3:
** ldr s0, \[x0\]
-** ins v0\.s\[1\], v1\.s\[0\]
+** uzp1 v0\.2s, v0\.2s, v1\.2s
** ret
*/
float32x2_t f32_3(float32_t a0, float32_t a1, float32_t *ptr) {
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
index 8f68e06a..3cf05cf 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
@@ -75,7 +75,7 @@ int64x2_t s64q_6(int64_t a0, int64_t a1) {
/*
** f64q_1:
-** ins v0\.d\[1\], v1\.d\[0\]
+** uzp1 v0\.2d, v0\.2d, v1\.2d
** ret
*/
float64x2_t f64q_1(float64_t a0, float64_t a1) {
@@ -98,7 +98,7 @@ float64x2_t f64q_2(float64_t a0, float64_t *ptr) {
/*
** f64q_3:
** ldr d0, \[x0\]
-** ins v0\.d\[1\], v1\.d\[0\]
+** uzp1 v0\.2d, v0\.2d, v1\.2d
** ret
*/
float64x2_t f64q_3(float64_t a0, float64_t a1, float64_t *ptr) {
@@ -140,7 +140,7 @@ float64x2_t f64q_6(float64_t a0, float64_t a1) {
/*
** s32q_1:
-** ins v0\.d\[1\], v1\.d\[0\]
+** uzp1 v0\.2d, v0\.2d, v1\.2d
** ret
*/
int32x4_t s32q_1(int32x2_t a0, int32x2_t a1) {
@@ -157,7 +157,7 @@ int32x4_t s32q_2(int32x2_t a0, int32x2_t *ptr) {
/*
** s32q_3:
** ldr d0, \[x0\]
-** ins v0\.d\[1\], v1\.d\[0\]
+** uzp1 v0\.2d, v0\.2d, v1\.2d
** ret
*/
int32x4_t s32q_3(int32x2_t a0, int32x2_t a1, int32x2_t *ptr) {
@@ -204,7 +204,7 @@ int32x4_t s32q_6(int32x2_t a0, int32x2_t a1) {
/*
** f32q_1:
-** ins v0\.d\[1\], v1\.d\[0\]
+** uzp1 v0\.2d, v0\.2d, v1\.2d
** ret
*/
float32x4_t f32q_1(float32x2_t a0, float32x2_t a1) {
@@ -221,7 +221,7 @@ float32x4_t f32q_2(float32x2_t a0, float32x2_t *ptr) {
/*
** f32q_3:
** ldr d0, \[x0\]
-** ins v0\.d\[1\], v1\.d\[0\]
+** uzp1 v0\.2d, v0\.2d, v1\.2d
** ret
*/
float32x4_t f32q_3(float32x2_t a0, float32x2_t a1, float32x2_t *ptr) {