author     Richard Sandiford <richard.sandiford@arm.com>    2022-02-09 16:57:05 +0000
committer  Richard Sandiford <richard.sandiford@arm.com>    2022-02-09 16:57:05 +0000
commit     85ac2fe44fd4acf8350dd74ccb003a2050baad2a (patch)
tree       bb28b1b1f8c19ae2ca9f34d26630784e34d122b4
parent     aeef5c57f161ad0258c5ab066ade2274bef3271a (diff)
aarch64: Add more vec_combine patterns
vec_combine is really one instruction on aarch64, provided that the
lowpart element is in the same register as the destination vector.
This patch adds patterns for that.

The patch fixes a regression from GCC 8.  Before the patch:

    int64x2_t s64q_1(int64_t a0, int64_t a1) {
      if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
        return (int64x2_t) { a1, a0 };
      else
        return (int64x2_t) { a0, a1 };
    }

generated:

    fmov    d0, x0
    ins     v0.d[1], x1
    ins     v0.d[1], x1
    ret

whereas GCC 8 generated the more respectable:

    dup     v0.2d, x0
    ins     v0.d[1], x1
    ret

gcc/
    * config/aarch64/predicates.md (aarch64_reg_or_mem_pair_operand): New
    predicate.
    * config/aarch64/aarch64-simd.md (*aarch64_combine_internal<mode>)
    (*aarch64_combine_internal_be<mode>): New patterns.

gcc/testsuite/
    * gcc.target/aarch64/vec-init-9.c: New test.
    * gcc.target/aarch64/vec-init-10.c: Likewise.
    * gcc.target/aarch64/vec-init-11.c: Likewise.
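[Editorial note, not part of the commit message: the example above covers the
register-destination case; the patch also adds STP alternatives for vec_concat
results that go straight to memory.  The sketch below mirrors the new
vec-init-9.c test (the function name store_combined is made up for
illustration); on a little-endian target it is expected to compile to a single
"stp d0, d1, [x0]" plus "ret", with the *_be pattern swapping the operands on
big-endian targets.]

    #include <arm_neon.h>

    /* Illustrative sketch: with the new STP alternatives the vector is
       never formed in a register; the two 64-bit halves are stored
       directly as a pair.  */
    void
    store_combined (float64x2_t *res, float64_t a0, float64_t a1)
    {
      res[0] = (float64x2_t) { a0, a1 };
    }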
-rw-r--r--  gcc/config/aarch64/aarch64-simd.md               62
-rw-r--r--  gcc/config/aarch64/predicates.md                  4
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/vec-init-10.c   15
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/vec-init-11.c   12
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/vec-init-9.c   267
5 files changed, 360 insertions, 0 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d6cd4c7..ead8039 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4326,6 +4326,25 @@
[(set_attr "type" "neon_load1_1reg_q")]
)
+;; This STP pattern is a partial duplicate of the general vec_concat patterns
+;; below. The reason for having both of them is that the alternatives of
+;; the later patterns do not have consistent register preferences: the STP
+;; alternatives have no preference between GPRs and FPRs (and if anything,
+;; the GPR form is more natural for scalar integers) whereas the other
+;; alternatives *require* an FPR for operand 1 and prefer one for operand 2.
+;;
+;; Using "*" to hide the STP alternatives from the RA penalizes cases in
+;; which the destination was always memory. On the other hand, expressing
+;; the true preferences makes GPRs seem more palatable than they really are
+;; for register destinations.
+;;
+;; Despite that, we do still want the general form to have STP alternatives,
+;; in order to handle cases where a register destination is spilled.
+;;
+;; The best compromise therefore seemed to be to have a dedicated STP
+;; pattern to catch cases in which the destination was always memory.
+;; This dedicated pattern must come first.
+
(define_insn "store_pair_lanes<mode>"
[(set (match_operand:<VDBL> 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn")
(vec_concat:<VDBL>
@@ -4338,6 +4357,49 @@
[(set_attr "type" "neon_stp, store_16")]
)
+;; Form a vector whose least significant half comes from operand 1 and whose
+;; most significant half comes from operand 2. The register alternatives
+;; tie the least significant half to the same register as the destination,
+;; so that only the other half needs to be handled explicitly. For the
+;; reasons given above, the STP alternatives use ? for constraints that
+;; the register alternatives either don't accept or themselves disparage.
+
+(define_insn "*aarch64_combine_internal<mode>"
+ [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn")
+ (vec_concat:<VDBL>
+ (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")
+ (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))]
+ "TARGET_SIMD
+ && !BYTES_BIG_ENDIAN
+ && (register_operand (operands[0], <VDBL>mode)
+ || register_operand (operands[2], <MODE>mode))"
+ "@
+ ins\t%0.d[1], %2.d[0]
+ ins\t%0.d[1], %2
+ ld1\t{%0.d}[1], %2
+ stp\t%d1, %d2, %y0
+ stp\t%x1, %x2, %y0"
+ [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")]
+)
+
+(define_insn "*aarch64_combine_internal_be<mode>"
+ [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn")
+ (vec_concat:<VDBL>
+ (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r")
+ (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")))]
+ "TARGET_SIMD
+ && BYTES_BIG_ENDIAN
+ && (register_operand (operands[0], <VDBL>mode)
+ || register_operand (operands[2], <MODE>mode))"
+ "@
+ ins\t%0.d[1], %2.d[0]
+ ins\t%0.d[1], %2
+ ld1\t{%0.d}[1], %2
+ stp\t%d2, %d1, %y0
+ stp\t%x2, %x1, %y0"
+ [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")]
+)
+
;; In this insn, operand 1 should be low, and operand 2 the high part of the
;; dest vector.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 7dc4c15..c308015 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -254,6 +254,10 @@
false,
ADDR_QUERY_LDP_STP_N)")))
+(define_predicate "aarch64_reg_or_mem_pair_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "aarch64_mem_pair_lanes_operand")))
+
(define_predicate "aarch64_prefetch_operand"
(match_test "aarch64_address_valid_for_prefetch_p (op, false)"))
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-10.c b/gcc/testsuite/gcc.target/aarch64/vec-init-10.c
new file mode 100644
index 0000000..f5dd83b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-10.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+int64x2_t f1(int64_t *x, int c) {
+ return c ? (int64x2_t) { x[0], x[2] } : (int64x2_t) { 0, 0 };
+}
+
+int64x2_t f2(int64_t *x, int i0, int i1, int c) {
+ return c ? (int64x2_t) { x[i0], x[i1] } : (int64x2_t) { 0, 0 };
+}
+
+/* { dg-final { scan-assembler-times {\t(?:ldr\td[0-9]+|ld1\t)} 4 } } */
+/* { dg-final { scan-assembler-not {\tldr\tx} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-11.c b/gcc/testsuite/gcc.target/aarch64/vec-init-11.c
new file mode 100644
index 0000000..df24270
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-11.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+void f1(int64x2_t *res, int64_t *x, int c0, int c1) {
+ res[0] = (int64x2_t) { c0 ? x[0] : 0, c1 ? x[2] : 0 };
+}
+
+/* { dg-final { scan-assembler-times {\tldr\tx[0-9]+} 2 } } */
+/* { dg-final { scan-assembler {\tstp\tx[0-9]+, x[0-9]+} } } */
+/* { dg-final { scan-assembler-not {\tldr\td} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
new file mode 100644
index 0000000..8f68e06a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
@@ -0,0 +1,267 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+void ext();
+
+/*
+** s64q_1:
+** fmov d0, x0
+** ins v0\.d\[1\], x1
+** ret
+*/
+int64x2_t s64q_1(int64_t a0, int64_t a1) {
+ if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ return (int64x2_t) { a1, a0 };
+ else
+ return (int64x2_t) { a0, a1 };
+}
+/*
+** s64q_2:
+** fmov d0, x0
+** ld1 {v0\.d}\[1\], \[x1\]
+** ret
+*/
+int64x2_t s64q_2(int64_t a0, int64_t *ptr) {
+ if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ return (int64x2_t) { ptr[0], a0 };
+ else
+ return (int64x2_t) { a0, ptr[0] };
+}
+/*
+** s64q_3:
+** ldr d0, \[x0\]
+** ins v0\.d\[1\], x1
+** ret
+*/
+int64x2_t s64q_3(int64_t *ptr, int64_t a1) {
+ if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ return (int64x2_t) { a1, ptr[0] };
+ else
+ return (int64x2_t) { ptr[0], a1 };
+}
+/*
+** s64q_4:
+** stp x1, x2, \[x0\]
+** ret
+*/
+void s64q_4(int64x2_t *res, int64_t a0, int64_t a1) {
+ res[0] = (int64x2_t) { a0, a1 };
+}
+/*
+** s64q_5:
+** stp x1, x2, \[x0, #?8\]
+** ret
+*/
+void s64q_5(uintptr_t res, int64_t a0, int64_t a1) {
+ *(int64x2_t *)(res + 8) = (int64x2_t) { a0, a1 };
+}
+/*
+** s64q_6:
+** ...
+** stp x0, x1, .*
+** ...
+** ldr q0, .*
+** ...
+** ret
+*/
+int64x2_t s64q_6(int64_t a0, int64_t a1) {
+ int64x2_t res = { a0, a1 };
+ ext ();
+ return res;
+}
+
+/*
+** f64q_1:
+** ins v0\.d\[1\], v1\.d\[0\]
+** ret
+*/
+float64x2_t f64q_1(float64_t a0, float64_t a1) {
+ if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ return (float64x2_t) { a1, a0 };
+ else
+ return (float64x2_t) { a0, a1 };
+}
+/*
+** f64q_2:
+** ld1 {v0\.d}\[1\], \[x0\]
+** ret
+*/
+float64x2_t f64q_2(float64_t a0, float64_t *ptr) {
+ if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ return (float64x2_t) { ptr[0], a0 };
+ else
+ return (float64x2_t) { a0, ptr[0] };
+}
+/*
+** f64q_3:
+** ldr d0, \[x0\]
+** ins v0\.d\[1\], v1\.d\[0\]
+** ret
+*/
+float64x2_t f64q_3(float64_t a0, float64_t a1, float64_t *ptr) {
+ if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ return (float64x2_t) { a1, ptr[0] };
+ else
+ return (float64x2_t) { ptr[0], a1 };
+}
+/*
+** f64q_4:
+** stp d0, d1, \[x0\]
+** ret
+*/
+void f64q_4(float64x2_t *res, float64_t a0, float64_t a1) {
+ res[0] = (float64x2_t) { a0, a1 };
+}
+/*
+** f64q_5:
+** stp d0, d1, \[x0, #?8\]
+** ret
+*/
+void f64q_5(uintptr_t res, float64_t a0, float64_t a1) {
+ *(float64x2_t *)(res + 8) = (float64x2_t) { a0, a1 };
+}
+/*
+** f64q_6:
+** ...
+** stp d0, d1, .*
+** ...
+** ldr q0, .*
+** ...
+** ret
+*/
+float64x2_t f64q_6(float64_t a0, float64_t a1) {
+ float64x2_t res = { a0, a1 };
+ ext ();
+ return res;
+}
+
+/*
+** s32q_1:
+** ins v0\.d\[1\], v1\.d\[0\]
+** ret
+*/
+int32x4_t s32q_1(int32x2_t a0, int32x2_t a1) {
+ return vcombine_s32 (a0, a1);
+}
+/*
+** s32q_2:
+** ld1 {v0\.d}\[1\], \[x0\]
+** ret
+*/
+int32x4_t s32q_2(int32x2_t a0, int32x2_t *ptr) {
+ return vcombine_s32 (a0, ptr[0]);
+}
+/*
+** s32q_3:
+** ldr d0, \[x0\]
+** ins v0\.d\[1\], v1\.d\[0\]
+** ret
+*/
+int32x4_t s32q_3(int32x2_t a0, int32x2_t a1, int32x2_t *ptr) {
+ return vcombine_s32 (ptr[0], a1);
+}
+/*
+** s32q_4:
+** stp d0, d1, \[x0\]
+** ret
+*/
+void s32q_4(int32x4_t *res, int32x2_t a0, int32x2_t a1) {
+ if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ res[0] = vcombine_s32 (a1, a0);
+ else
+ res[0] = vcombine_s32 (a0, a1);
+}
+/*
+** s32q_5:
+** stp d0, d1, \[x0, #?8\]
+** ret
+*/
+void s32q_5(uintptr_t res, int32x2_t a0, int32x2_t a1) {
+ if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ *(int32x4_t *)(res + 8) = vcombine_s32 (a1, a0);
+ else
+ *(int32x4_t *)(res + 8) = vcombine_s32 (a0, a1);
+}
+/*
+** s32q_6:
+** ...
+** stp d0, d1, .*
+** ...
+** ldr q0, .*
+** ...
+** ret
+*/
+int32x4_t s32q_6(int32x2_t a0, int32x2_t a1) {
+ int32x4_t res = (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ ? vcombine_s32 (a1, a0)
+ : vcombine_s32 (a0, a1));
+ ext ();
+ return res;
+}
+
+/*
+** f32q_1:
+** ins v0\.d\[1\], v1\.d\[0\]
+** ret
+*/
+float32x4_t f32q_1(float32x2_t a0, float32x2_t a1) {
+ return vcombine_f32 (a0, a1);
+}
+/*
+** f32q_2:
+** ld1 {v0\.d}\[1\], \[x0\]
+** ret
+*/
+float32x4_t f32q_2(float32x2_t a0, float32x2_t *ptr) {
+ return vcombine_f32 (a0, ptr[0]);
+}
+/*
+** f32q_3:
+** ldr d0, \[x0\]
+** ins v0\.d\[1\], v1\.d\[0\]
+** ret
+*/
+float32x4_t f32q_3(float32x2_t a0, float32x2_t a1, float32x2_t *ptr) {
+ return vcombine_f32 (ptr[0], a1);
+}
+/*
+** f32q_4:
+** stp d0, d1, \[x0\]
+** ret
+*/
+void f32q_4(float32x4_t *res, float32x2_t a0, float32x2_t a1) {
+ if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ res[0] = vcombine_f32 (a1, a0);
+ else
+ res[0] = vcombine_f32 (a0, a1);
+}
+/*
+** f32q_5:
+** stp d0, d1, \[x0, #?8\]
+** ret
+*/
+void f32q_5(uintptr_t res, float32x2_t a0, float32x2_t a1) {
+ if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ *(float32x4_t *)(res + 8) = vcombine_f32 (a1, a0);
+ else
+ *(float32x4_t *)(res + 8) = vcombine_f32 (a0, a1);
+}
+/*
+** f32q_6:
+** ...
+** stp d0, d1, .*
+** ...
+** ldr q0, .*
+** ...
+** ret
+*/
+float32x4_t f32q_6(float32x2_t a0, float32x2_t a1) {
+ float32x4_t res = (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ ? vcombine_f32 (a1, a0)
+ : vcombine_f32 (a0, a1));
+ ext ();
+ return res;
+}