aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPrathamesh Kulkarni <prathamesh.kulkarni@linaro.org>2022-06-12 08:50:16 +0530
committerPrathamesh Kulkarni <prathamesh.kulkarni@linaro.org>2022-06-12 08:55:04 +0530
commit494bec025002df422f2faa947138bf3643d80b54 (patch)
treee6c5ba8e07f688100d879290d8b2f3ad929e34e4
parentcbd842717ec5cab989141bf1575846c2acef818d (diff)
downloadgcc-494bec025002df422f2faa947138bf3643d80b54.zip
gcc-494bec025002df422f2faa947138bf3643d80b54.tar.gz
gcc-494bec025002df422f2faa947138bf3643d80b54.tar.bz2
PR96463: Optimise svld1rq from vectors for little endian AArch64 targets.
The patch folds: lhs = svld1rq({-1, -1, ...}, rhs) into: tmp = mem_ref<vectype> [(elem_type * {ref-all}) rhs] lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3 ...}>. which is then expanded using aarch64_expand_sve_dupq. Example: svint32_t foo (int32x4_t x) { return svld1rq (svptrue_b8 (), &x[0]); } code-gen: foo: .LFB4350: dup z0.q, z0.q[0] ret The patch relaxes type-checking for VEC_PERM_EXPR by allowing different vector types for lhs and rhs provided: (1) rhs3 is constant and has integer type element. (2) len(lhs) == len(rhs3) and len(rhs1) == len(rhs2) (3) lhs and rhs have same element type. gcc/ChangeLog: PR target/96463 * config/aarch64/aarch64-sve-builtins-base.cc: Include ssa.h. (svld1rq_impl::fold): Define. * config/aarch64/aarch64.cc (expand_vec_perm_d): Define new members op_mode and op_vec_flags. (aarch64_evpc_reencode): Initialize newd.op_mode and newd.op_vec_flags. (aarch64_evpc_sve_dup): New function. (aarch64_expand_vec_perm_const_1): Gate existing calls to aarch64_evpc_* functions under d->vmode == d->op_mode, and call aarch64_evpc_sve_dup. (aarch64_vectorize_vec_perm_const): Remove assert d->vmode != d->op_mode, and initialize d.op_mode and d.op_vec_flags. * tree-cfg.cc (verify_gimple_assign_ternary): Allow different vector types for lhs and rhs in VEC_PERM_EXPR if rhs3 is constant. gcc/testsuite/ChangeLog: PR target/96463 * gcc.target/aarch64/sve/acle/general/pr96463-1.c: New test. * gcc.target/aarch64/sve/acle/general/pr96463-2.c: Likewise.
-rw-r--r--gcc/config/aarch64/aarch64-sve-builtins-base.cc59
-rw-r--r--gcc/config/aarch64/aarch64.cc95
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c29
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c29
-rw-r--r--gcc/tree-cfg.cc40
5 files changed, 212 insertions, 40 deletions
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index bee4109..82f9eba 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,7 @@
#include "aarch64-sve-builtins-shapes.h"
#include "aarch64-sve-builtins-base.h"
#include "aarch64-sve-builtins-functions.h"
+#include "ssa.h"
using namespace aarch64_sve;
@@ -1207,6 +1208,64 @@ public:
insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
return e.use_contiguous_load_insn (icode);
}
+
+ gimple *
+ fold (gimple_folder &f) const override
+ {
+ tree arg0 = gimple_call_arg (f.call, 0);
+ tree arg1 = gimple_call_arg (f.call, 1);
+
+ /* Transform:
+ lhs = svld1rq ({-1, -1, ... }, arg1)
+ into:
+ tmp = mem_ref<vectype> [(elem * {ref-all}) arg1]
+ lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+ on little endian target.
+ vectype is the corresponding ADVSIMD type. */
+
+ if (!BYTES_BIG_ENDIAN
+ && integer_all_onesp (arg0))
+ {
+ tree lhs = gimple_call_lhs (f.call);
+ tree lhs_type = TREE_TYPE (lhs);
+ poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
+ tree eltype = TREE_TYPE (lhs_type);
+
+ scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
+ machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
+ tree vectype = build_vector_type_for_mode (eltype, vq_mode);
+
+ tree elt_ptr_type
+ = build_pointer_type_for_mode (eltype, VOIDmode, true);
+ tree zero = build_zero_cst (elt_ptr_type);
+
+ /* Use element type alignment. */
+ tree access_type
+ = build_aligned_type (vectype, TYPE_ALIGN (eltype));
+
+ tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
+ tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
+ gimple *mem_ref_stmt
+ = gimple_build_assign (mem_ref_lhs, mem_ref_op);
+ gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+ int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
+ vec_perm_builder sel (lhs_len, source_nelts, 1);
+ for (int i = 0; i < source_nelts; i++)
+ sel.quick_push (i);
+
+ vec_perm_indices indices (sel, 1, source_nelts);
+ gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
+ TYPE_MODE (access_type),
+ indices));
+ tree mask_type = build_vector_type (ssizetype, lhs_len);
+ tree mask = vec_perm_indices_to_tree (mask_type, indices);
+ return gimple_build_assign (lhs, VEC_PERM_EXPR,
+ mem_ref_lhs, mem_ref_lhs, mask);
+ }
+
+ return NULL;
+ }
};
class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5969d1f..d21e041 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23342,7 +23342,9 @@ struct expand_vec_perm_d
rtx target, op0, op1;
vec_perm_indices perm;
machine_mode vmode;
+ machine_mode op_mode;
unsigned int vec_flags;
+ unsigned int op_vec_flags;
bool one_vector_p;
bool testing_p;
};
@@ -23577,6 +23579,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
newd.vmode = new_mode;
newd.vec_flags = VEC_ADVSIMD;
+ newd.op_mode = newd.vmode;
+ newd.op_vec_flags = newd.vec_flags;
newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
@@ -23891,6 +23895,33 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
return true;
}
+/* Try to implement D using SVE dup instruction. */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+ if (BYTES_BIG_ENDIAN
+ || !d->one_vector_p
+ || d->vec_flags != VEC_SVE_DATA
+ || d->op_vec_flags != VEC_ADVSIMD
+ || d->perm.encoding ().nelts_per_pattern () != 1
+ || !known_eq (d->perm.encoding ().npatterns (),
+ GET_MODE_NUNITS (d->op_mode))
+ || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
+ return false;
+
+ int npatterns = d->perm.encoding ().npatterns ();
+ for (int i = 0; i < npatterns; i++)
+ if (!known_eq (d->perm[i], i))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+ return true;
+}
+
/* Try to implement D using SVE SEL instruction. */
static bool
@@ -24014,6 +24045,8 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
+ gcc_assert (d->op_mode != E_VOIDmode);
+
/* The pattern matching functions above are written to look for a small
number to begin the sequence (0, 1, N/2). If we begin with an index
from the second operand, we can swap the operands. */
@@ -24030,30 +24063,39 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
|| d->vec_flags == VEC_SVE_PRED)
&& known_gt (nelt, 1))
{
- if (aarch64_evpc_rev_local (d))
- return true;
- else if (aarch64_evpc_rev_global (d))
- return true;
- else if (aarch64_evpc_ext (d))
- return true;
- else if (aarch64_evpc_dup (d))
- return true;
- else if (aarch64_evpc_zip (d))
- return true;
- else if (aarch64_evpc_uzp (d))
- return true;
- else if (aarch64_evpc_trn (d))
- return true;
- else if (aarch64_evpc_sel (d))
- return true;
- else if (aarch64_evpc_ins (d))
- return true;
- else if (aarch64_evpc_reencode (d))
- return true;
- if (d->vec_flags == VEC_SVE_DATA)
- return aarch64_evpc_sve_tbl (d);
- else if (d->vec_flags == VEC_ADVSIMD)
- return aarch64_evpc_tbl (d);
+ if (d->vmode == d->op_mode)
+ {
+ if (aarch64_evpc_rev_local (d))
+ return true;
+ else if (aarch64_evpc_rev_global (d))
+ return true;
+ else if (aarch64_evpc_ext (d))
+ return true;
+ else if (aarch64_evpc_dup (d))
+ return true;
+ else if (aarch64_evpc_zip (d))
+ return true;
+ else if (aarch64_evpc_uzp (d))
+ return true;
+ else if (aarch64_evpc_trn (d))
+ return true;
+ else if (aarch64_evpc_sel (d))
+ return true;
+ else if (aarch64_evpc_ins (d))
+ return true;
+ else if (aarch64_evpc_reencode (d))
+ return true;
+
+ if (d->vec_flags == VEC_SVE_DATA)
+ return aarch64_evpc_sve_tbl (d);
+ else if (d->vec_flags == VEC_ADVSIMD)
+ return aarch64_evpc_tbl (d);
+ }
+ else
+ {
+ if (aarch64_evpc_sve_dup (d))
+ return true;
+ }
}
return false;
}
@@ -24065,9 +24107,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
rtx target, rtx op0, rtx op1,
const vec_perm_indices &sel)
{
- if (vmode != op_mode)
- return false;
-
struct expand_vec_perm_d d;
/* Check whether the mask can be applied to a single vector. */
@@ -24091,6 +24130,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
sel.nelts_per_input ());
d.vmode = vmode;
d.vec_flags = aarch64_classify_vector_mode (d.vmode);
+ d.op_mode = op_mode;
+ d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
d.target = target;
d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
if (op0 == op1)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
new file mode 100644
index 0000000..b68f43c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type x) \
+{ \
+ return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8x16_t, s8)
+TEST(svint16_t, int16x8_t, s16)
+TEST(svint32_t, int32x4_t, s32)
+TEST(svint64_t, int64x2_t, s64)
+
+TEST(svuint8_t, uint8x16_t, u8)
+TEST(svuint16_t, uint16x8_t, u16)
+TEST(svuint32_t, uint32x4_t, u32)
+TEST(svuint64_t, uint64x2_t, u64)
+
+TEST(svfloat16_t, float16x8_t, f16)
+TEST(svfloat32_t, float32x4_t, f32)
+TEST(svfloat64_t, float64x2_t, f64)
+
+TEST(svbfloat16_t, bfloat16x8_t, bf16)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 12 { target aarch64_little_endian } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
new file mode 100644
index 0000000..196de3f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type *x) \
+{ \
+ return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8_t, s8)
+TEST(svint16_t, int16_t, s16)
+TEST(svint32_t, int32_t, s32)
+TEST(svint64_t, int64_t, s64)
+
+TEST(svuint8_t, uint8_t, u8)
+TEST(svuint16_t, uint16_t, u16)
+TEST(svuint32_t, uint32_t, u32)
+TEST(svuint64_t, uint64_t, u64)
+
+TEST(svfloat16_t, float16_t, f16)
+TEST(svfloat32_t, float32_t, f32)
+TEST(svfloat64_t, float64_t, f64)
+
+TEST(svbfloat16_t, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 12 { target aarch64_little_endian } } } */
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 8de1b14..9e5d84a 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -4297,18 +4297,14 @@ verify_gimple_assign_ternary (gassign *stmt)
break;
case VEC_PERM_EXPR:
- if (!useless_type_conversion_p (lhs_type, rhs1_type)
- || !useless_type_conversion_p (lhs_type, rhs2_type))
- {
- error ("type mismatch in %qs", code_name);
- debug_generic_expr (lhs_type);
- debug_generic_expr (rhs1_type);
- debug_generic_expr (rhs2_type);
- debug_generic_expr (rhs3_type);
- return true;
- }
+ /* If permute is constant, then we allow for lhs and rhs
+ to have different vector types, provided:
+ (1) lhs, rhs1, rhs2 have same element type.
+ (2) rhs3 vector is constant and has integer element type.
+ (3) len(lhs) == len(rhs3) && len(rhs1) == len(rhs2). */
- if (TREE_CODE (rhs1_type) != VECTOR_TYPE
+ if (TREE_CODE (lhs_type) != VECTOR_TYPE
+ || TREE_CODE (rhs1_type) != VECTOR_TYPE
|| TREE_CODE (rhs2_type) != VECTOR_TYPE
|| TREE_CODE (rhs3_type) != VECTOR_TYPE)
{
@@ -4320,10 +4316,28 @@ verify_gimple_assign_ternary (gassign *stmt)
return true;
}
+ /* If rhs3 is constant, we allow lhs, rhs1 and rhs2 to be different vector types,
+ as long as lhs, rhs1 and rhs2 have same element type. */
+ if (TREE_CONSTANT (rhs3)
+ ? (!useless_type_conversion_p (TREE_TYPE (lhs_type), TREE_TYPE (rhs1_type))
+ || !useless_type_conversion_p (TREE_TYPE (lhs_type), TREE_TYPE (rhs2_type)))
+ : (!useless_type_conversion_p (lhs_type, rhs1_type)
+ || !useless_type_conversion_p (lhs_type, rhs2_type)))
+ {
+ error ("type mismatch in %qs", code_name);
+ debug_generic_expr (lhs_type);
+ debug_generic_expr (rhs1_type);
+ debug_generic_expr (rhs2_type);
+ debug_generic_expr (rhs3_type);
+ return true;
+ }
+
+ /* If rhs3 is constant, relax the check len(rhs2) == len(rhs3). */
if (maybe_ne (TYPE_VECTOR_SUBPARTS (rhs1_type),
TYPE_VECTOR_SUBPARTS (rhs2_type))
- || maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type),
- TYPE_VECTOR_SUBPARTS (rhs3_type))
+ || (!TREE_CONSTANT(rhs3)
+ && maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type),
+ TYPE_VECTOR_SUBPARTS (rhs3_type)))
|| maybe_ne (TYPE_VECTOR_SUBPARTS (rhs3_type),
TYPE_VECTOR_SUBPARTS (lhs_type)))
{