aboutsummaryrefslogtreecommitdiff
path: root/gcc/config
diff options
context:
space:
mode:
authorRichard Sandiford <richard.sandiford@arm.com>2024-12-30 12:50:56 +0000
committerRichard Sandiford <richard.sandiford@arm.com>2024-12-30 12:50:56 +0000
commitea66f57c9603312a8e4117b719d55becbc28ec43 (patch)
treebd29dfcac1bb9ff849f93df3e4c9e7da564a5cdd /gcc/config
parent5f40ff8efde2b8b140f170619e99b6df9722f79d (diff)
downloadgcc-ea66f57c9603312a8e4117b719d55becbc28ec43.zip
gcc-ea66f57c9603312a8e4117b719d55becbc28ec43.tar.gz
gcc-ea66f57c9603312a8e4117b719d55becbc28ec43.tar.bz2
aarch64: Add mf8 data movement intrinsics
This patch adds mf8 variants of what I'll loosely call the existing "data movement" intrinsics, including the recent FEAT_LUT ones. I think this completes the FP8 intrinsic definitions. The new intrinsics are defined entirely in the compiler. This should make it easy to move the existing non-mf8 variants into the compiler as well, but that's too invasive for stage 3 and so is left to GCC 16. I wondered about trying to reduce the cut-&-paste in the .def file, but in the end decided against it. I have a plan for specifying this information in a different format, but again that would need to wait until GCC 16. The patch includes some support for gimple folding. I initially tested the patch without it, so that all the rtl expansion code was exercised. vlut.c fails for all types with big-endian ILP32, but that's for a later patch. gcc/ * config/aarch64/aarch64.md (UNSPEC_BSL, UNSPEC_COMBINE, UNSPEC_DUP) (UNSPEC_DUP_LANE, UNSPEC_GET_LANE, UNSPEC_LD1_DUP, UNSPEC_LD1x2) (UNSPEC_LD1x3, UNSPEC_LD1x4, UNSPEC_SET_LANE, UNSPEC_ST1_LANE) (UNSPEC_ST1x2, UNSPEC_ST1x3, UNSPEC_ST1x4, UNSPEC_VCREATE) (UNSPEC_VEC_COPY): New unspecs. * config/aarch64/iterators.md (UNSPEC_TBL): Likewise. * config/aarch64/aarch64-simd-pragma-builtins.def: Add definitions of the mf8 data movement intrinsics. * config/aarch64/aarch64-protos.h (aarch64_advsimd_vector_array_mode): Declare. * config/aarch64/aarch64.cc (aarch64_advsimd_vector_array_mode): Make public. * config/aarch64/aarch64-builtins.h (qualifier_const_pointer): New aarch64_type_qualifiers member. * config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_LOW_BUILTINS) (AARCH64_SIMD_VGET_HIGH_BUILTINS): Add mf8 variants. (aarch64_int_or_fp_type): Handle qualifier_modal_float. (aarch64_num_lanes): New function. (binary_two_lanes, load, load_lane, store, store_lane): New signatures. (unary_lane): Likewise. (simd_type::nunits): New member function. (simd_types): Add pointer types. (aarch64_fntype): Handle the new signatures. 
(require_immediate_lane_index): Use aarch64_num_lanes. (aarch64_pragma_builtins_checker::check): Handle the new intrinsics. (aarch64_convert_address, aarch64_dereference_pointer) (aarch64_canonicalize_lane, aarch64_convert_to_lane_mask) (aarch64_pack_into_v128s, aarch64_expand_permute_pair) (aarch64_expand_tbl_tbx): New functions. (aarch64_expand_pragma_builtin): Handle the new intrinsics. (aarch64_force_gimple_val, aarch64_copy_vops, aarch64_fold_to_val) (aarch64_dereference, aarch64_get_lane_bit_index, aarch64_get_lane) (aarch64_set_lane, aarch64_fold_combine, aarch64_fold_load) (aarch64_fold_store, aarch64_ext_index, aarch64_rev_index) (aarch64_trn_index, aarch64_uzp_index, aarch64_zip_index) (aarch64_fold_permute): New functions, some split out from aarch64_general_gimple_fold_builtin. (aarch64_gimple_fold_pragma_builtin): New function. (aarch64_general_gimple_fold_builtin): Use the new functions above. * config/aarch64/aarch64-simd.md (aarch64_dup_lane<mode>) (aarch64_dup_lane_<vswap_width_name><mode>): Add "@" to name. (aarch64_simd_vec_set<mode>): Likewise. (*aarch64_simd_vec_copy_lane_<vswap_width_name><mode>): Likewise. (aarch64_simd_bsl<mode>): Likewise. (aarch64_combine<mode>): Likewise. (aarch64_cm<optab><mode><vczle><vczbe>): Likewise. (aarch64_simd_ld2r<vstruct_elt>): Likewise. (aarch64_vec_load_lanes<mode>_lane<vstruct_elt>): Likewise. (aarch64_simd_ld3r<vstruct_elt>): Likewise. (aarch64_simd_ld4r<vstruct_elt>): Likewise. (aarch64_ld1x3<vstruct_elt>): Likewise. (aarch64_ld1x4<vstruct_elt>): Likewise. (aarch64_st1x2<vstruct_elt>): Likewise. (aarch64_st1x3<vstruct_elt>): Likewise. (aarch64_st1x4<vstruct_elt>): Likewise. (aarch64_ld<nregs><vstruct_elt>): Likewise. (aarch64_ld1<VALL_F16:mode>): Likewise. (aarch64_ld1x2<vstruct_elt>): Likewise. (aarch64_ld<nregs>_lane<vstruct_elt>): Likewise. (aarch64_<PERMUTE:perm_insn><mode><vczle><vczbe>): Likewise. (aarch64_ext<mode>): Likewise. 
(aarch64_rev<REVERSE:rev_op><mode><vczle><vczbe>): Likewise. (aarch64_st<nregs><vstruct_elt>): Likewise. (aarch64_st<nregs>_lane<vstruct_elt>): Likewise. (aarch64_st1<VALL_F16:mode>): Likewise. gcc/testsuite/ * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: Add mfloat8 support. * gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vbsl.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vcombine.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vcreate.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vext.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vget_high.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld1x2.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld1x4.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vldX.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vrev.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vset_lane.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vshuffle.inc: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst1x4.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vtbX.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vtrn.c: Likewise. 
* gcc.target/aarch64/advsimd-intrinsics/vtrn_half.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vuzp.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vuzp_half.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vzip.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vzip_half.c: Likewise. * gcc.target/aarch64/simd/lut.c: Likewise. * gcc.target/aarch64/vdup_lane_1.c: Likewise. * gcc.target/aarch64/vdup_lane_2.c: Likewise. * gcc.target/aarch64/vdup_n_1.c: Likewise. * gcc.target/aarch64/vect_copy_lane_1.c: Likewise. * gcc.target/aarch64/simd/mf8_data_1.c: New test. * gcc.target/aarch64/simd/mf8_data_2.c: Likewise. Co-authored-by: Saurabh Jha <saurabh.jha@arm.com>
Diffstat (limited to 'gcc/config')
-rw-r--r--gcc/config/aarch64/aarch64-builtins.cc889
-rw-r--r--gcc/config/aarch64/aarch64-builtins.h2
-rw-r--r--gcc/config/aarch64/aarch64-protos.h2
-rw-r--r--gcc/config/aarch64/aarch64-simd-pragma-builtins.def246
-rw-r--r--gcc/config/aarch64/aarch64-simd.md60
-rw-r--r--gcc/config/aarch64/aarch64.cc2
-rw-r--r--gcc/config/aarch64/aarch64.md16
-rw-r--r--gcc/config/aarch64/iterators.md1
8 files changed, 1092 insertions, 126 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 9d1d026..6b3e220 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -702,6 +702,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
VREINTERPRETQ_BUILTINS
#define AARCH64_SIMD_VGET_LOW_BUILTINS \
+ VGET_LOW_BUILTIN(mf8) \
VGET_LOW_BUILTIN(f16) \
VGET_LOW_BUILTIN(f32) \
VGET_LOW_BUILTIN(f64) \
@@ -719,6 +720,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
VGET_LOW_BUILTIN(bf16)
#define AARCH64_SIMD_VGET_HIGH_BUILTINS \
+ VGET_HIGH_BUILTIN(mf8) \
VGET_HIGH_BUILTIN(f16) \
VGET_HIGH_BUILTIN(f32) \
VGET_HIGH_BUILTIN(f64) \
@@ -1096,6 +1098,8 @@ aarch64_int_or_fp_type (machine_mode mode,
switch (mode)
{
case E_QImode:
+ if (qualifiers & qualifier_modal_float)
+ return aarch64_mfp8_type_node;
return QUAL_TYPE (QI);
case E_HImode:
return QUAL_TYPE (HI);
@@ -1333,6 +1337,16 @@ aarch64_init_simd_builtin_scalar_types (void)
"__builtin_aarch64_simd_udi");
}
+/* If MODE is a single Advanced SIMD vector, return the number of lanes in the
+ vector. If MODE is an Advanced SIMD structure/tuple mode, return the number
+ of lanes in a single vector. */
+static unsigned int
+aarch64_num_lanes (machine_mode mode)
+{
+ unsigned int nregs = targetm.hard_regno_nregs (V0_REGNUM, mode);
+ return exact_div (GET_MODE_NUNITS (mode), nregs).to_constant ();
+}
+
/* Return a set of FLAG_* flags derived from FLAGS
that describe what a function with result MODE could do,
taking the command-line flags into account. */
@@ -1620,9 +1634,15 @@ enum class aarch64_builtin_signatures
{
binary,
binary_lane,
+ binary_two_lanes,
+ load,
+ load_lane,
+ store,
+ store_lane,
ternary,
ternary_lane,
unary,
+ unary_lane,
};
namespace {
@@ -1631,22 +1651,27 @@ namespace {
function argument type or return type. */
struct simd_type {
tree type () const { return aarch64_simd_builtin_type (mode, qualifiers); }
+ unsigned nunits () const { return GET_MODE_NUNITS (mode).to_constant (); }
machine_mode mode;
aarch64_type_qualifiers qualifiers;
};
namespace simd_types {
-#define VARIANTS(BASE, D, Q, MODE, QUALIFIERS) \
- constexpr simd_type BASE { V##D##MODE, QUALIFIERS }; \
- constexpr simd_type BASE##x2 { V2x##D##MODE, QUALIFIERS }; \
- constexpr simd_type BASE##x3 { V3x##D##MODE, QUALIFIERS }; \
- constexpr simd_type BASE##x4 { V4x##D##MODE, QUALIFIERS }; \
- constexpr simd_type BASE##q { V##Q##MODE, QUALIFIERS }; \
- constexpr simd_type BASE##qx2 { V2x##Q##MODE, QUALIFIERS }; \
- constexpr simd_type BASE##qx3 { V3x##Q##MODE, QUALIFIERS }; \
- constexpr simd_type BASE##qx4 { V4x##Q##MODE, QUALIFIERS }; \
- constexpr simd_type BASE##_scalar { MODE, QUALIFIERS };
+#define VARIANTS(BASE, D, Q, MODE, QUALIFIERS) \
+ constexpr simd_type BASE { V##D##MODE, QUALIFIERS }; \
+ constexpr simd_type BASE##x2 { V2x##D##MODE, QUALIFIERS }; \
+ constexpr simd_type BASE##x3 { V3x##D##MODE, QUALIFIERS }; \
+ constexpr simd_type BASE##x4 { V4x##D##MODE, QUALIFIERS }; \
+ constexpr simd_type BASE##q { V##Q##MODE, QUALIFIERS }; \
+ constexpr simd_type BASE##qx2 { V2x##Q##MODE, QUALIFIERS }; \
+ constexpr simd_type BASE##qx3 { V3x##Q##MODE, QUALIFIERS }; \
+ constexpr simd_type BASE##qx4 { V4x##Q##MODE, QUALIFIERS }; \
+ constexpr simd_type BASE##_scalar { MODE, QUALIFIERS }; \
+ constexpr simd_type BASE##_scalar_ptr \
+ { MODE, aarch64_type_qualifiers (QUALIFIERS | qualifier_pointer) }; \
+ constexpr simd_type BASE##_scalar_const_ptr \
+ { MODE, aarch64_type_qualifiers (QUALIFIERS | qualifier_const_pointer) };
VARIANTS (mf8, 8, 16, QImode, qualifier_modal_float)
VARIANTS (p8, 8, 16, QImode, qualifier_poly)
@@ -1707,27 +1732,50 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
{
case aarch64_builtin_signatures::binary:
case aarch64_builtin_signatures::binary_lane:
+ case aarch64_builtin_signatures::load_lane:
return_type = builtin_data.types[0].type ();
for (int i = 1; i <= 2; ++i)
arg_types.quick_push (builtin_data.types[i].type ());
break;
- case aarch64_builtin_signatures::ternary:
- case aarch64_builtin_signatures::ternary_lane:
+ case aarch64_builtin_signatures::binary_two_lanes:
+ /* binary_two_lanes has to be handled as a special case because indices
+ interleave vectors. */
return_type = builtin_data.types[0].type ();
- for (int i = 1; i <= 3; ++i)
- arg_types.quick_push (builtin_data.types[i].type ());
+ arg_types.quick_push (builtin_data.types[1].type ());
+ arg_types.quick_push (integer_type_node);
+ arg_types.quick_push (builtin_data.types[2].type ());
+ arg_types.quick_push (integer_type_node);
break;
+ case aarch64_builtin_signatures::load:
case aarch64_builtin_signatures::unary:
+ case aarch64_builtin_signatures::unary_lane:
return_type = builtin_data.types[0].type ();
arg_types.quick_push (builtin_data.types[1].type ());
break;
+
+ case aarch64_builtin_signatures::store:
+ case aarch64_builtin_signatures::store_lane:
+ return_type = void_type_node;
+ for (int i = 0; i <= 1; ++i)
+ arg_types.quick_push (builtin_data.types[i].type ());
+ break;
+
+ case aarch64_builtin_signatures::ternary:
+ case aarch64_builtin_signatures::ternary_lane:
+ return_type = builtin_data.types[0].type ();
+ for (int i = 1; i <= 3; ++i)
+ arg_types.quick_push (builtin_data.types[i].type ());
+ break;
}
switch (builtin_data.signature)
{
case aarch64_builtin_signatures::binary_lane:
+ case aarch64_builtin_signatures::load_lane:
+ case aarch64_builtin_signatures::store_lane:
case aarch64_builtin_signatures::ternary_lane:
+ case aarch64_builtin_signatures::unary_lane:
arg_types.quick_push (integer_type_node);
break;
@@ -2654,8 +2702,9 @@ require_immediate_lane_index (unsigned int lane_argno, unsigned vec_argno,
{
auto vec_mode = TYPE_MODE (TREE_TYPE (args[vec_argno]));
auto elt_mode = TYPE_MODE (TREE_TYPE (args[elt_argno]));
- auto nunits = exact_div (GET_MODE_SIZE (vec_mode),
- GET_MODE_UNIT_SIZE (elt_mode)).to_constant ();
+ auto nunits = (aarch64_num_lanes (vec_mode)
+ * GET_MODE_UNIT_SIZE (vec_mode)
+ / GET_MODE_UNIT_SIZE (elt_mode));
return require_immediate_range (lane_argno, 0, nunits - 1);
}
@@ -2674,8 +2723,25 @@ require_immediate_lane_index (unsigned int lane_argno, unsigned int vec_argno)
bool
aarch64_pragma_builtins_checker::check ()
{
+ auto &types = builtin_data.types;
+
switch (builtin_data.unspec)
{
+ case UNSPEC_DUP_LANE:
+ case UNSPEC_GET_LANE:
+ case UNSPEC_LD2_LANE:
+ case UNSPEC_LD3_LANE:
+ case UNSPEC_LD4_LANE:
+ case UNSPEC_SET_LANE:
+ case UNSPEC_ST1_LANE:
+ case UNSPEC_ST2_LANE:
+ case UNSPEC_ST3_LANE:
+ case UNSPEC_ST4_LANE:
+ return require_immediate_lane_index (nargs - 1, nargs - 2);
+
+ case UNSPEC_EXT:
+ return require_immediate_range (2, 0, types[2].nunits () - 1);
+
case UNSPEC_FDOT_LANE_FP8:
return require_immediate_lane_index (nargs - 2, nargs - 3, 0);
@@ -2695,11 +2761,8 @@ aarch64_pragma_builtins_checker::check ()
case UNSPEC_LUTI2:
case UNSPEC_LUTI4:
{
- auto vector_to_index_mode = builtin_data.types[nargs - 1].mode;
- int vector_to_index_nunits
- = GET_MODE_NUNITS (vector_to_index_mode).to_constant ();
- int output_mode_nunits
- = GET_MODE_NUNITS (builtin_data.types[0].mode).to_constant ();
+ auto vector_to_index_nunits = types[nargs - 1].nunits ();
+ int output_mode_nunits = types[0].nunits ();
int high;
if (builtin_data.unspec == UNSPEC_LUTI2)
@@ -2710,6 +2773,11 @@ aarch64_pragma_builtins_checker::check ()
return require_immediate_range (nargs - 1, 0, high);
}
+ case UNSPEC_VEC_COPY:
+ /* & rather than && so that we report errors against both indices. */
+ return (require_immediate_lane_index (1, 0)
+ & require_immediate_lane_index (3, 2));
+
default:
return true;
}
@@ -3622,6 +3690,52 @@ aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target)
return ops[0].value;
}
+/* Convert ptr_mode value OP to a Pmode value (for ILP32). */
+static void
+aarch64_convert_address (expand_operand *op)
+{
+ op->value = convert_memory_address (Pmode, op->value);
+ op->mode = Pmode;
+}
+
+/* Dereference the pointer in OP, turning it into a memory reference to
+ NELTS instances of MEM_MODE. */
+static void
+aarch64_dereference_pointer (expand_operand *op, machine_mode mem_mode,
+ unsigned int nelts = 1)
+{
+ if (nelts == 1)
+ {
+ op->value = gen_rtx_MEM (mem_mode, op->value);
+ op->mode = mem_mode;
+ }
+ else
+ {
+ op->value = gen_rtx_MEM (BLKmode, op->value);
+ op->mode = BLKmode;
+ set_mem_size (op->value, GET_MODE_SIZE (mem_mode) * nelts);
+ }
+}
+
+/* OP contains an integer index into a vector or tuple of mode VEC_MODE.
+ Convert OP from an architectural lane number to a GCC lane number. */
+static void
+aarch64_canonicalize_lane (expand_operand *op, machine_mode vec_mode)
+{
+ auto nunits = aarch64_num_lanes (vec_mode);
+ op->value = gen_int_mode (ENDIAN_LANE_N (nunits, UINTVAL (op->value)),
+ SImode);
+}
+
+/* OP contains an integer index into a vector or tuple of mode VEC_MODE.
+ Convert OP from an architectural lane number to a vec_merge mask. */
+static void
+aarch64_convert_to_lane_mask (expand_operand *op, machine_mode vec_mode)
+{
+ auto nunits = aarch64_num_lanes (vec_mode);
+ create_integer_operand (op, 1 << ENDIAN_LANE_N (nunits, INTVAL (op->value)));
+}
+
/* If OP is a 128-bit vector, convert it to the equivalent 64-bit vector.
Do nothing otherwise. */
static void
@@ -3634,6 +3748,56 @@ aarch64_convert_to_v64 (expand_operand *op)
}
}
+/* If OP is a 64-bit (half-register) vector or a structure of 64-bit vectors,
+ pack its contents into the smallest associated full-register mode,
+ padding with zeros if necessary. Return true if padding was used. */
+static bool
+aarch64_pack_into_v128s (expand_operand *op)
+{
+ bool padded = false;
+ unsigned int nregs = targetm.hard_regno_nregs (V0_REGNUM, op->mode);
+
+ /* Do nothing if the operand is already a full-register mode. */
+ if (known_eq (nregs * UNITS_PER_VREG, GET_MODE_SIZE (op->mode)))
+ return padded;
+
+ auto elt_mode = GET_MODE_INNER (op->mode);
+ auto v64_mode = aarch64_v64_mode (elt_mode).require ();
+ auto v128_mode = aarch64_v128_mode (elt_mode).require ();
+
+ auto new_mode = v128_mode;
+ if (nregs > 2)
+ new_mode = aarch64_advsimd_vector_array_mode (v128_mode, CEIL (nregs, 2))
+ .require ();
+
+ /* Get enough V64_MODE inputs to fill NEW_MODE, which is made up of a
+ whole number of V128_MODEs. */
+ auto_vec<rtx, 4> inputs;
+ for (unsigned int i = 0; i < nregs; ++i)
+ {
+ rtx input = simplify_gen_subreg (v64_mode, op->value, op->mode,
+ i * GET_MODE_SIZE (v64_mode));
+ inputs.quick_push (input);
+ }
+ if (nregs & 1)
+ {
+ inputs.quick_push (CONST0_RTX (v64_mode));
+ padded = true;
+ }
+
+ /* Create a NEW_MODE register and build it up from individual V128_MODEs. */
+ op->mode = new_mode;
+ op->value = gen_reg_rtx (new_mode);
+ for (unsigned int i = 0; i < inputs.length (); i += 2)
+ {
+ rtx result = gen_rtx_SUBREG (v128_mode, op->value,
+ i * GET_MODE_SIZE (v64_mode));
+ emit_insn (gen_aarch64_combine (v64_mode, result,
+ inputs[i], inputs[i + 1]));
+ }
+ return padded;
+}
+
/* UNSPEC is a high unspec, indicated by "2" in mnemonics and "_high" in
intrinsic names. Return the equivalent low unspec. */
static int
@@ -3652,6 +3816,88 @@ aarch64_get_low_unspec (int unspec)
}
}
+/* OPS contains the operands for one of the permute pair functions vtrn,
+ vuzp or vzip. Expand the call, given that PERMUTE1 is the unspec for
+ the first permute and PERMUTE2 is the unspec for the second permute. */
+static rtx
+aarch64_expand_permute_pair (vec<expand_operand> &ops, int permute1,
+ int permute2)
+{
+ rtx op0 = force_reg (ops[1].mode, ops[1].value);
+ rtx op1 = force_reg (ops[2].mode, ops[2].value);
+ rtx target = gen_reg_rtx (ops[0].mode);
+ rtx target0 = gen_rtx_SUBREG (ops[1].mode, target, 0);
+ rtx target1 = gen_rtx_SUBREG (ops[1].mode, target,
+ GET_MODE_SIZE (ops[1].mode));
+ emit_insn (gen_aarch64 (permute1, ops[1].mode, target0, op0, op1));
+ emit_insn (gen_aarch64 (permute2, ops[1].mode, target1, op0, op1));
+ return target;
+}
+
+/* Emit a TBL or TBX instruction with inputs INPUTS and a result of mode
+ MODE. Return the result of the instruction.
+
+ UNSPEC is either UNSPEC_TBL or UNSPEC_TBX. The inputs must already be in
+ registers. */
+static rtx
+aarch64_expand_tbl_tbx (vec<rtx> &inputs, int unspec, machine_mode mode)
+{
+ rtx result = gen_reg_rtx (mode);
+ rtvec vec = gen_rtvec_v (inputs.length (), inputs.address ());
+ emit_insn (gen_rtx_SET (result, gen_rtx_UNSPEC (mode, vec, unspec)));
+ return result;
+}
+
+/* Emit a TBL or TBX intrinsic with the operands given by OPS. Return the
+ result of the intrinsic.
+
+ UNSPEC is either UNSPEC_TBL or UNSPEC_TBX. */
+static rtx
+aarch64_expand_tbl_tbx (vec<expand_operand> &ops, int unspec)
+{
+ for (unsigned int i = 1; i < ops.length (); ++i)
+ ops[i].value = force_reg (ops[i].mode, ops[i].value);
+
+ /* Handle the legacy forms for which the table is composed of 64-bit
+ rather than 128-bit vectors. */
+ auto &table = ops[ops.length () - 2];
+ auto table_nelts = GET_MODE_NUNITS (table.mode);
+ bool padded = aarch64_pack_into_v128s (&table);
+
+ /* Packing to 128-bit vectors is enough for everything except the 64-bit
+ forms of vtbx1 and vtbx3, where we need to handle the zero padding. */
+ if (unspec == UNSPEC_TBL || !padded)
+ {
+ auto_vec<rtx, 3> inputs;
+ for (unsigned int i = 1; i < ops.length (); ++i)
+ inputs.quick_push (ops[i].value);
+ return aarch64_expand_tbl_tbx (inputs, unspec, ops[0].mode);
+ }
+
+ /* Generate a TBL, which will give the right results for indices that
+ are less than TABLE_NELTS. */
+ auto_vec<rtx, 2> inputs;
+ for (unsigned int i = 2; i < ops.length (); ++i)
+ inputs.quick_push (ops[i].value);
+ rtx tbl_result = aarch64_expand_tbl_tbx (inputs, UNSPEC_TBL, ops[0].mode);
+
+ /* Get a mask of the indices that are less than TABLE_NELTS. */
+ auto &indices = ops.last ();
+ rtx cmp_result = gen_reg_rtx (indices.mode);
+ rtx bound_rtx = gen_int_mode (table_nelts, GET_MODE_INNER (indices.mode));
+ rtx bound_vec_rtx = gen_const_vec_duplicate (indices.mode, bound_rtx);
+ emit_insn (gen_aarch64_cm (GTU, indices.mode, cmp_result,
+ force_reg (indices.mode, bound_vec_rtx),
+ indices.value));
+
+ /* Select from the TBL result if the index is less than TABLE_NELTS
+ and from OPS[1] otherwise. */
+ rtx result = gen_reg_rtx (ops[0].mode);
+ auto icode = get_vcond_mask_icode (ops[0].mode, indices.mode);
+ emit_insn (GEN_FCN (icode) (result, tbl_result, ops[1].value, cmp_result));
+ return result;
+}
+
/* Expand CALL_EXPR EXP, given that it is a call to the function described
by BUILTIN_DATA, and return the function's return value. Put the result
in TARGET if convenient. */
@@ -3660,15 +3906,19 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
const aarch64_pragma_builtins_data &builtin_data)
{
unsigned int nargs = call_expr_nargs (exp);
+ bool returns_void = VOID_TYPE_P (TREE_TYPE (exp));
auto_vec<expand_operand, 8> ops;
- ops.safe_grow (nargs + 1);
- create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (exp)));
- for (unsigned int i = 1; i <= nargs; ++i)
+ if (!returns_void)
+ create_output_operand (ops.safe_push ({}), target,
+ TYPE_MODE (TREE_TYPE (exp)));
+ for (unsigned int i = 0; i < nargs; ++i)
{
- tree arg = CALL_EXPR_ARG (exp, i - 1);
- create_input_operand (&ops[i], expand_normal (arg),
+ tree arg = CALL_EXPR_ARG (exp, i);
+ create_input_operand (ops.safe_push ({}), expand_normal (arg),
TYPE_MODE (TREE_TYPE (arg)));
+ if (POINTER_TYPE_P (TREE_TYPE (arg)))
+ aarch64_convert_address (&ops.last ());
}
if (builtin_data.flags & FLAG_USES_FPMR)
@@ -3698,12 +3948,43 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
insn_code icode;
switch (builtin_data.unspec)
{
+ case UNSPEC_BSL:
+ icode = code_for_aarch64_simd_bsl (ops[0].mode);
+ break;
+
+ case UNSPEC_COMBINE:
+ icode = code_for_aarch64_combine (ops[1].mode);
+ break;
+
+ case UNSPEC_DUP:
+ if (builtin_data.signature == aarch64_builtin_signatures::load)
+ aarch64_dereference_pointer (&ops[1], GET_MODE_INNER (ops[0].mode));
+ return expand_vector_broadcast (ops[0].mode, ops[1].value);
+
+ case UNSPEC_DUP_LANE:
+ aarch64_canonicalize_lane (&ops[2], ops[1].mode);
+ if (ops[0].mode == ops[1].mode)
+ icode = code_for_aarch64_dup_lane (ops[0].mode);
+ else
+ icode = code_for_aarch64_dup_lane (ops[0].mode, ops[0].mode);
+ break;
+
+ case UNSPEC_EXT:
+ icode = code_for_aarch64_ext (ops[0].mode);
+ break;
+
case UNSPEC_FAMAX:
case UNSPEC_FAMIN:
case UNSPEC_F1CVTL_FP8:
case UNSPEC_F2CVTL_FP8:
case UNSPEC_FDOT_FP8:
case UNSPEC_FSCALE:
+ case UNSPEC_TRN1:
+ case UNSPEC_TRN2:
+ case UNSPEC_UZP1:
+ case UNSPEC_UZP2:
+ case UNSPEC_ZIP1:
+ case UNSPEC_ZIP2:
icode = code_for_aarch64 (builtin_data.unspec, ops[0].mode);
break;
@@ -3737,6 +4018,7 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
}
case UNSPEC_FDOT_LANE_FP8:
+ /* This pattern does not canonicalize the lane number. */
icode = code_for_aarch64_lane (builtin_data.unspec,
ops[0].mode, ops[3].mode);
break;
@@ -3749,8 +4031,7 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
case UNSPEC_FMLALLTT_FP8:
if (builtin_data.signature == aarch64_builtin_signatures::ternary_lane)
{
- ops[4].value = aarch64_endian_lane_rtx (ops[3].mode,
- INTVAL (ops[4].value));
+ aarch64_canonicalize_lane (&ops[4], ops[3].mode);
icode = code_for_aarch64_lane (builtin_data.unspec,
ops[0].mode, ops[3].mode);
}
@@ -3760,6 +4041,55 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
gcc_unreachable ();
break;
+ case UNSPEC_GET_LANE:
+ aarch64_canonicalize_lane (&ops[2], ops[1].mode);
+ icode = code_for_aarch64_get_lane (ops[1].mode);
+ break;
+
+ case UNSPEC_LD1:
+ icode = code_for_aarch64_ld1 (ops[0].mode);
+ break;
+
+ case UNSPEC_LD1x2:
+ icode = code_for_aarch64_ld1x2 (ops[0].mode);
+ break;
+
+ case UNSPEC_LD1x3:
+ icode = code_for_aarch64_ld1x3 (ops[0].mode);
+ break;
+
+ case UNSPEC_LD1x4:
+ icode = code_for_aarch64_ld1x4 (ops[0].mode);
+ break;
+
+ case UNSPEC_LD2:
+ case UNSPEC_LD3:
+ case UNSPEC_LD4:
+ icode = code_for_aarch64_ld (ops[0].mode, ops[0].mode);
+ break;
+
+ case UNSPEC_LD2_DUP:
+ aarch64_dereference_pointer (&ops[1], GET_MODE_INNER (ops[0].mode), 2);
+ icode = code_for_aarch64_simd_ld2r (ops[0].mode);
+ break;
+
+ case UNSPEC_LD3_DUP:
+ aarch64_dereference_pointer (&ops[1], GET_MODE_INNER (ops[0].mode), 3);
+ icode = code_for_aarch64_simd_ld3r (ops[0].mode);
+ break;
+
+ case UNSPEC_LD4_DUP:
+ aarch64_dereference_pointer (&ops[1], GET_MODE_INNER (ops[0].mode), 4);
+ icode = code_for_aarch64_simd_ld4r (ops[0].mode);
+ break;
+
+ case UNSPEC_LD2_LANE:
+ case UNSPEC_LD3_LANE:
+ case UNSPEC_LD4_LANE:
+ aarch64_canonicalize_lane (&ops[3], ops[2].mode);
+ icode = code_for_aarch64_ld_lane (ops[0].mode, ops[0].mode);
+ break;
+
case UNSPEC_LUTI2:
case UNSPEC_LUTI4:
create_integer_operand (ops.safe_push ({}),
@@ -3767,6 +4097,86 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
icode = code_for_aarch64_lut (ops[1].mode, ops[2].mode);
break;
+ case UNSPEC_REV16:
+ case UNSPEC_REV32:
+ case UNSPEC_REV64:
+ icode = code_for_aarch64_rev (builtin_data.unspec, ops[0].mode);
+ break;
+
+ case UNSPEC_SET_LANE:
+ if (builtin_data.signature == aarch64_builtin_signatures::load_lane)
+ aarch64_dereference_pointer (&ops[1], GET_MODE_INNER (ops[0].mode));
+ /* The vec_set operand order is: dest, scalar, mask, vector. */
+ std::swap (ops[2], ops[3]);
+ aarch64_convert_to_lane_mask (&ops[2], ops[3].mode);
+ icode = code_for_aarch64_simd_vec_set (ops[0].mode);
+ break;
+
+ case UNSPEC_ST1:
+ icode = code_for_aarch64_st1 (ops[1].mode);
+ break;
+
+ case UNSPEC_ST1_LANE:
+ aarch64_dereference_pointer (&ops[0], GET_MODE_INNER (ops[1].mode));
+ /* Reinterpret ops[0] as an output. */
+ create_fixed_operand (&ops[0], ops[0].value);
+ aarch64_canonicalize_lane (&ops[2], ops[1].mode);
+ icode = code_for_aarch64_get_lane (ops[1].mode);
+ break;
+
+ case UNSPEC_ST1x2:
+ icode = code_for_aarch64_st1x2 (ops[1].mode);
+ break;
+
+ case UNSPEC_ST1x3:
+ icode = code_for_aarch64_st1x3 (ops[1].mode);
+ break;
+
+ case UNSPEC_ST1x4:
+ icode = code_for_aarch64_st1x4 (ops[1].mode);
+ break;
+
+ case UNSPEC_ST2:
+ case UNSPEC_ST3:
+ case UNSPEC_ST4:
+ icode = code_for_aarch64_st (ops[1].mode, ops[1].mode);
+ break;
+
+ case UNSPEC_ST2_LANE:
+ case UNSPEC_ST3_LANE:
+ case UNSPEC_ST4_LANE:
+ aarch64_canonicalize_lane (&ops[2], ops[1].mode);
+ icode = code_for_aarch64_st_lane (ops[1].mode, ops[1].mode);
+ break;
+
+ case UNSPEC_TBL:
+ case UNSPEC_TBX:
+ return aarch64_expand_tbl_tbx (ops, builtin_data.unspec);
+
+ case UNSPEC_TRN:
+ return aarch64_expand_permute_pair (ops, UNSPEC_TRN1, UNSPEC_TRN2);
+
+ case UNSPEC_UZP:
+ return aarch64_expand_permute_pair (ops, UNSPEC_UZP1, UNSPEC_UZP2);
+
+ case UNSPEC_VCREATE:
+ return force_lowpart_subreg (ops[0].mode, ops[1].value, ops[1].mode);
+
+ case UNSPEC_VEC_COPY:
+ {
+ aarch64_convert_to_lane_mask (&ops[2], ops[1].mode);
+ aarch64_canonicalize_lane (&ops[4], ops[3].mode);
+ if (ops[1].mode == ops[3].mode)
+ icode = code_for_aarch64_simd_vec_copy_lane (ops[1].mode);
+ else
+ icode = code_for_aarch64_simd_vec_copy_lane (ops[1].mode,
+ ops[1].mode);
+ break;
+ }
+
+ case UNSPEC_ZIP:
+ return aarch64_expand_permute_pair (ops, UNSPEC_ZIP1, UNSPEC_ZIP2);
+
default:
gcc_unreachable ();
}
@@ -4214,12 +4624,346 @@ aarch64_record_vector_load_arg (tree addr)
cfun->machine->vector_load_decls->add (decl);
}
+/* Force VAL into a valid gimple value, creating a new SSA_NAME if
+ necessary. Insert any new statements before GSI. */
+static tree
+aarch64_force_gimple_val (gimple_stmt_iterator *gsi, tree val)
+{
+ if (is_gimple_val (val))
+ return val;
+
+ tree tmp = make_ssa_name (TREE_TYPE (val));
+ gsi_insert_before_without_update (gsi, gimple_build_assign (tmp, val),
+ GSI_SAME_STMT);
+ return tmp;
+}
+
+/* Copy vops from FROM to TO and return TO. */
+static gimple *
+aarch64_copy_vops (gimple *to, gimple *from)
+{
+ gimple_set_vuse (to, gimple_vuse (from));
+ gimple_set_vdef (to, gimple_vdef (from));
+ return to;
+}
+
+/* Fold STMT (at GSI) to VAL, with SEQ setting up the value of VAL.
+ Return the replacement statement. */
+static gimple *
+aarch64_fold_to_val (gcall *stmt, gimple_stmt_iterator *gsi,
+ gimple *seq, tree val)
+{
+ auto *assign = gimple_build_assign (gimple_call_lhs (stmt), val);
+ gimple_seq_add_stmt_without_update (&seq, assign);
+ gsi_replace_with_seq_vops (gsi, seq);
+ return assign;
+}
+
+/* Dereference pointer ADDR, giving a memory reference of type TYPE. */
+static tree
+aarch64_dereference (tree addr, tree type)
+{
+ tree elt_type = (VECTOR_TYPE_P (type) ? TREE_TYPE (type) : type);
+ tree elt_ptr_type = build_pointer_type_for_mode (elt_type, VOIDmode, true);
+ tree zero = build_zero_cst (elt_ptr_type);
+ /* Use element type alignment. */
+ tree access_type = build_aligned_type (type, TYPE_ALIGN (elt_type));
+ return fold_build2 (MEM_REF, access_type, addr, zero);
+}
+
+/* LANE is a lane index into VEC. Return the associated bit index
+ (counting from the first byte in memory order). */
+static tree
+aarch64_get_lane_bit_index (tree vec, tree lane)
+{
+ auto vec_mode = TYPE_MODE (TREE_TYPE (vec));
+ auto nunits = aarch64_num_lanes (vec_mode);
+ auto idx = ENDIAN_LANE_N (nunits, tree_to_uhwi (lane));
+ return bitsize_int (idx * GET_MODE_UNIT_BITSIZE (vec_mode));
+}
+
+/* LANE is a lane index into VEC. Return a BIT_FIELD_REF for the
+ selected element. */
+static tree
+aarch64_get_lane (tree vec, tree lane)
+{
+ auto elt_type = TREE_TYPE (TREE_TYPE (vec));
+ return fold_build3 (BIT_FIELD_REF, elt_type, vec, TYPE_SIZE (elt_type),
+ aarch64_get_lane_bit_index (vec, lane));
+}
+
+/* LANE is a lane index into VEC. Return a BIT_INSERT_EXPR that replaces
+ that index with ELT and stores the result in LHS. */
+static gimple *
+aarch64_set_lane (tree lhs, tree elt, tree vec, tree lane)
+{
+ tree bit = aarch64_get_lane_bit_index (vec, lane);
+ return gimple_build_assign (lhs, BIT_INSERT_EXPR, vec, elt, bit);
+}
+
+/* Fold a call to vcombine. */
+static gimple *
+aarch64_fold_combine (gcall *stmt)
+{
+ tree first_part, second_part;
+ if (BYTES_BIG_ENDIAN)
+ {
+ second_part = gimple_call_arg (stmt, 0);
+ first_part = gimple_call_arg (stmt, 1);
+ }
+ else
+ {
+ first_part = gimple_call_arg (stmt, 0);
+ second_part = gimple_call_arg (stmt, 1);
+ }
+ tree ret_type = gimple_call_return_type (stmt);
+ tree ctor = build_constructor_va (ret_type, 2, NULL_TREE, first_part,
+ NULL_TREE, second_part);
+ return gimple_build_assign (gimple_call_lhs (stmt), ctor);
+}
+
+/* Fold a call to vld1, given that it loads something of type TYPE. */
+static gimple *
+aarch64_fold_load (gcall *stmt, tree type)
+{
+ /* Punt until after inlining, so that we stand more chance of
+ recording something meaningful in vector_load_decls. */
+ if (!cfun->after_inlining)
+ return nullptr;
+ tree addr = gimple_call_arg (stmt, 0);
+ aarch64_record_vector_load_arg (addr);
+ if (!BYTES_BIG_ENDIAN)
+ {
+ tree mem = aarch64_dereference (addr, type);
+ auto *new_stmt = gimple_build_assign (gimple_get_lhs (stmt), mem);
+ return aarch64_copy_vops (new_stmt, stmt);
+ }
+ return nullptr;
+}
+
+/* Fold a call to vst1, given that it loads something of type TYPE. */
+static gimple *
+aarch64_fold_store (gcall *stmt, tree type)
+{
+ tree addr = gimple_call_arg (stmt, 0);
+ tree data = gimple_call_arg (stmt, 1);
+ if (!BYTES_BIG_ENDIAN)
+ {
+ tree mem = aarch64_dereference (addr, type);
+ auto *new_stmt = gimple_build_assign (mem, data);
+ return aarch64_copy_vops (new_stmt, stmt);
+ }
+ return nullptr;
+}
+
+/* An aarch64_fold_permute callback for vext. SELECTOR is the value of
+ the final argument. */
+static unsigned int
+aarch64_ext_index (unsigned int, unsigned int selector, unsigned int i)
+{
+ return selector + i;
+}
+
+/* An aarch64_fold_permute callback for vrev. SELECTOR is the number
+ of elements in each reversal group. */
+static unsigned int
+aarch64_rev_index (unsigned int, unsigned int selector, unsigned int i)
+{
+ return ROUND_DOWN (i, selector) + (selector - 1) - (i % selector);
+}
+
+/* An aarch64_fold_permute callback for vtrn. SELECTOR is 0 for TRN1
+ and 1 for TRN2. */
+static unsigned int
+aarch64_trn_index (unsigned int nelts, unsigned int selector, unsigned int i)
+{
+ return (i % 2) * nelts + ROUND_DOWN (i, 2) + selector;
+}
+
+/* An aarch64_fold_permute callback for vuzp. SELECTOR is 0 for UZP1
+ and 1 for UZP2. */
+static unsigned int
+aarch64_uzp_index (unsigned int, unsigned int selector, unsigned int i)
+{
+ return i * 2 + selector;
+}
+
+/* An aarch64_fold_permute callback for vzip. SELECTOR is 0 for ZIP1
+ and 1 for ZIP2. */
+static unsigned int
+aarch64_zip_index (unsigned int nelts, unsigned int selector, unsigned int i)
+{
+ return (i % 2) * nelts + (i / 2) + selector * (nelts / 2);
+}
+
+/* Fold STMT to a VEC_PERM_EXPR on the first NINPUTS arguments.
+ Make the VEC_PERM_EXPR emulate an NINPUTS-input TBL in which
+ architectural lane I of the result selects architectural lane:
+
+ GET_INDEX (NELTS, SELECTOR, I)
+
+ of the input table. NELTS is the number of elements in one vector. */
+static gimple *
+aarch64_fold_permute (gcall *stmt, unsigned int ninputs,
+ unsigned int (*get_index) (unsigned int, unsigned int,
+ unsigned int),
+ unsigned int selector)
+{
+ tree op0 = gimple_call_arg (stmt, 0);
+ tree op1 = ninputs == 2 ? gimple_call_arg (stmt, 1) : op0;
+ auto nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0)).to_constant ();
+ vec_perm_builder sel (nelts, nelts, 1);
+ for (unsigned int i = 0; i < nelts; ++i)
+ {
+ unsigned int index = get_index (nelts, selector,
+ ENDIAN_LANE_N (nelts, i));
+ unsigned int vec = index / nelts;
+ unsigned int elt = ENDIAN_LANE_N (nelts, index % nelts);
+ sel.quick_push (vec * nelts + elt);
+ }
+
+ vec_perm_indices indices (sel, ninputs, nelts);
+ tree mask_type = build_vector_type (ssizetype, nelts);
+ tree mask = vec_perm_indices_to_tree (mask_type, indices);
+ return gimple_build_assign (gimple_call_lhs (stmt), VEC_PERM_EXPR,
+ op0, op1, mask);
+}
+
+/* Try to fold STMT (at GSI), given that it is a call to the builtin
+ described by BUILTIN_DATA. Return the new statement on success,
+ otherwise return null. */
+static gimple *
+aarch64_gimple_fold_pragma_builtin
+ (gcall *stmt, gimple_stmt_iterator *gsi,
+ const aarch64_pragma_builtins_data &builtin_data)
+{
+ auto &types = builtin_data.types;
+
+ switch (builtin_data.unspec)
+ {
+ case UNSPEC_COMBINE:
+ return aarch64_fold_combine (stmt);
+
+ case UNSPEC_DUP:
+ case UNSPEC_DUP_LANE:
+ {
+ tree arg = gimple_call_arg (stmt, 0);
+ tree type = types[0].type ();
+ if (builtin_data.signature == aarch64_builtin_signatures::load)
+ arg = aarch64_dereference (arg, TREE_TYPE (type));
+ else if (builtin_data.unspec == UNSPEC_DUP_LANE)
+ arg = aarch64_get_lane (arg, gimple_call_arg (stmt, 1));
+ arg = aarch64_force_gimple_val (gsi, arg);
+
+ tree dup = build_vector_from_val (type, arg);
+ return aarch64_fold_to_val (stmt, gsi, nullptr, dup);
+ }
+
+ case UNSPEC_EXT:
+ {
+ auto index = tree_to_uhwi (gimple_call_arg (stmt, 2));
+ return aarch64_fold_permute (stmt, 2, aarch64_ext_index, index);
+ }
+
+ case UNSPEC_GET_LANE:
+ {
+ tree val = aarch64_get_lane (gimple_call_arg (stmt, 0),
+ gimple_call_arg (stmt, 1));
+ return gimple_build_assign (gimple_call_lhs (stmt), val);
+ }
+
+ case UNSPEC_LD1:
+ return aarch64_fold_load (stmt, types[0].type ());
+
+ case UNSPEC_REV16:
+ {
+ auto selector = 16 / GET_MODE_UNIT_BITSIZE (types[0].mode);
+ return aarch64_fold_permute (stmt, 1, aarch64_rev_index, selector);
+ }
+
+ case UNSPEC_REV32:
+ {
+ auto selector = 32 / GET_MODE_UNIT_BITSIZE (types[0].mode);
+ return aarch64_fold_permute (stmt, 1, aarch64_rev_index, selector);
+ }
+
+ case UNSPEC_REV64:
+ {
+ auto selector = 64 / GET_MODE_UNIT_BITSIZE (types[0].mode);
+ return aarch64_fold_permute (stmt, 1, aarch64_rev_index, selector);
+ }
+
+ case UNSPEC_SET_LANE:
+ {
+ tree elt = gimple_call_arg (stmt, 0);
+ if (builtin_data.signature == aarch64_builtin_signatures::load_lane)
+ {
+ elt = aarch64_dereference (elt, TREE_TYPE (types[0].type ()));
+ elt = aarch64_force_gimple_val (gsi, elt);
+ }
+ return aarch64_set_lane (gimple_call_lhs (stmt), elt,
+ gimple_call_arg (stmt, 1),
+ gimple_call_arg (stmt, 2));
+ }
+
+ case UNSPEC_ST1:
+ return aarch64_fold_store (stmt, types[1].type ());
+
+ case UNSPEC_ST1_LANE:
+ {
+ tree val = aarch64_get_lane (gimple_call_arg (stmt, 1),
+ gimple_call_arg (stmt, 2));
+ tree mem = aarch64_dereference (gimple_call_arg (stmt, 0),
+ TREE_TYPE (types[0].type ()));
+ val = aarch64_force_gimple_val (gsi, val);
+ return aarch64_copy_vops (gimple_build_assign (mem, val), stmt);
+ }
+
+ case UNSPEC_TRN1:
+ return aarch64_fold_permute (stmt, 2, aarch64_trn_index, 0);
+
+ case UNSPEC_TRN2:
+ return aarch64_fold_permute (stmt, 2, aarch64_trn_index, 1);
+
+ case UNSPEC_UZP1:
+ return aarch64_fold_permute (stmt, 2, aarch64_uzp_index, 0);
+
+ case UNSPEC_UZP2:
+ return aarch64_fold_permute (stmt, 2, aarch64_uzp_index, 1);
+
+ case UNSPEC_VCREATE:
+ return gimple_build_assign (gimple_call_lhs (stmt),
+ fold_build1 (VIEW_CONVERT_EXPR,
+ types[0].type (),
+ gimple_call_arg (stmt, 0)));
+
+ case UNSPEC_VEC_COPY:
+ {
+ tree elt = aarch64_get_lane (gimple_call_arg (stmt, 2),
+ gimple_call_arg (stmt, 3));
+ elt = aarch64_force_gimple_val (gsi, elt);
+ return aarch64_set_lane (gimple_call_lhs (stmt), elt,
+ gimple_call_arg (stmt, 0),
+ gimple_call_arg (stmt, 1));
+ }
+
+ case UNSPEC_ZIP1:
+ return aarch64_fold_permute (stmt, 2, aarch64_zip_index, 0);
+
+ case UNSPEC_ZIP2:
+ return aarch64_fold_permute (stmt, 2, aarch64_zip_index, 1);
+
+ default:
+ return nullptr;
+ }
+}
+
/* Try to fold STMT, given that it's a call to the built-in function with
subcode FCODE. Return the new statement on success and null on
failure. */
gimple *
aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
- gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED)
+ gimple_stmt_iterator *gsi)
{
gimple *new_stmt = NULL;
unsigned nargs = gimple_call_num_args (stmt);
@@ -4249,81 +4993,33 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
BUILTIN_VDC (BINOP, combine, 0, QUIET)
BUILTIN_VD_I (BINOPU, combine, 0, DEFAULT)
BUILTIN_VDC_P (BINOPP, combine, 0, DEFAULT)
- {
- tree first_part, second_part;
- if (BYTES_BIG_ENDIAN)
- {
- second_part = args[0];
- first_part = args[1];
- }
- else
- {
- first_part = args[0];
- second_part = args[1];
- }
- tree ret_type = gimple_call_return_type (stmt);
- tree ctor = build_constructor_va (ret_type, 2, NULL_TREE, first_part,
- NULL_TREE, second_part);
- new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ctor);
- }
- break;
+ new_stmt = aarch64_fold_combine (stmt);
+ break;
/*lower store and load neon builtins to gimple. */
BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
- /* Punt until after inlining, so that we stand more chance of
- recording something meaningful in vector_load_decls. */
- if (!cfun->after_inlining)
- break;
- aarch64_record_vector_load_arg (args[0]);
- if (!BYTES_BIG_ENDIAN)
- {
- enum aarch64_simd_type mem_type
- = get_mem_type_for_load_store(fcode);
- aarch64_simd_type_info_trees simd_type
- = aarch64_simd_types_trees[mem_type];
- tree elt_ptr_type = build_pointer_type_for_mode (simd_type.eltype,
- VOIDmode, true);
- tree zero = build_zero_cst (elt_ptr_type);
- /* Use element type alignment. */
- tree access_type
- = build_aligned_type (simd_type.itype,
- TYPE_ALIGN (simd_type.eltype));
- new_stmt
- = gimple_build_assign (gimple_get_lhs (stmt),
- fold_build2 (MEM_REF,
- access_type,
- args[0], zero));
- gimple_set_vuse (new_stmt, gimple_vuse (stmt));
- gimple_set_vdef (new_stmt, gimple_vdef (stmt));
- }
- break;
+ {
+ enum aarch64_simd_type mem_type
+ = get_mem_type_for_load_store (fcode);
+ aarch64_simd_type_info_trees simd_type
+ = aarch64_simd_types_trees[mem_type];
+ new_stmt = aarch64_fold_load (stmt, simd_type.itype);
+ break;
+ }
BUILTIN_VALL_F16 (STORE1, st1, 0, STORE)
BUILTIN_VDQ_I (STORE1_U, st1, 0, STORE)
BUILTIN_VALLP_NO_DI (STORE1_P, st1, 0, STORE)
- if (!BYTES_BIG_ENDIAN)
- {
- enum aarch64_simd_type mem_type
- = get_mem_type_for_load_store(fcode);
- aarch64_simd_type_info_trees simd_type
- = aarch64_simd_types_trees[mem_type];
- tree elt_ptr_type = build_pointer_type_for_mode (simd_type.eltype,
- VOIDmode, true);
- tree zero = build_zero_cst (elt_ptr_type);
- /* Use element type alignment. */
- tree access_type
- = build_aligned_type (simd_type.itype,
- TYPE_ALIGN (simd_type.eltype));
- new_stmt
- = gimple_build_assign (fold_build2 (MEM_REF, access_type,
- args[0], zero),
- args[1]);
- gimple_set_vuse (new_stmt, gimple_vuse (stmt));
- gimple_set_vdef (new_stmt, gimple_vdef (stmt));
- }
- break;
+ {
+ enum aarch64_simd_type mem_type
+ = get_mem_type_for_load_store (fcode);
+ aarch64_simd_type_info_trees simd_type
+ = aarch64_simd_types_trees[mem_type];
+ new_stmt = aarch64_fold_store (stmt, simd_type.itype);
+ break;
+ }
BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10, ALL)
BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, ALL)
@@ -4440,6 +5136,9 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
}
break;
default:
+ if (auto builtin_data = aarch64_get_pragma_builtin (fcode))
+ new_stmt = aarch64_gimple_fold_pragma_builtin (stmt, gsi,
+ *builtin_data);
break;
}
diff --git a/gcc/config/aarch64/aarch64-builtins.h b/gcc/config/aarch64/aarch64-builtins.h
index f4d54de..d998370 100644
--- a/gcc/config/aarch64/aarch64-builtins.h
+++ b/gcc/config/aarch64/aarch64-builtins.h
@@ -28,6 +28,8 @@ enum aarch64_type_qualifiers
qualifier_const = 0x2, /* 1 << 1 */
/* T *foo. */
qualifier_pointer = 0x4, /* 1 << 2 */
+ /* const T *foo. */
+ qualifier_const_pointer = 0x6,
/* Used when expanding arguments if an operand could
be an immediate. */
qualifier_immediate = 0x8, /* 1 << 3 */
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 18764e4..21c7e67 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -896,6 +896,8 @@ bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
machine_mode aarch64_sve_int_mode (machine_mode);
opt_machine_mode aarch64_sve_pred_mode (unsigned int);
machine_mode aarch64_sve_pred_mode (machine_mode);
+opt_machine_mode aarch64_advsimd_vector_array_mode (machine_mode,
+ unsigned HOST_WIDE_INT);
opt_machine_mode aarch64_sve_data_mode (scalar_mode, poly_uint64);
bool aarch64_sve_mode_p (machine_mode);
HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
index 8924262..e725b52 100644
--- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
@@ -26,6 +26,26 @@
#define ENTRY_BINARY_LANE(N, T0, T1, T2, U, F) \
ENTRY (N, binary_lane, T0, T1, T2, none, U, F)
+#undef ENTRY_BINARY_TWO_LANES
+#define ENTRY_BINARY_TWO_LANES(N, T0, T1, T2, U, F) \
+ ENTRY (N, binary_two_lanes, T0, T1, T2, none, U, F)
+
+#undef ENTRY_LOAD
+#define ENTRY_LOAD(N, T0, T1, U) \
+ ENTRY (N, load, T0, T1, none, none, U, LOAD)
+
+#undef ENTRY_LOAD_LANE
+#define ENTRY_LOAD_LANE(N, T0, T1, T2, U) \
+ ENTRY (N, load_lane, T0, T1, T2, none, U, LOAD)
+
+#undef ENTRY_STORE
+#define ENTRY_STORE(N, T0, T1, U) \
+ ENTRY (N, store, T0, T1, none, none, U, STORE)
+
+#undef ENTRY_STORE_LANE
+#define ENTRY_STORE_LANE(N, T0, T1, U) \
+ ENTRY (N, store_lane, T0, T1, none, none, U, STORE)
+
#undef ENTRY_TERNARY
#define ENTRY_TERNARY(N, T0, T1, T2, T3, U, F) \
ENTRY (N, ternary, T0, T1, T2, T3, U, F)
@@ -38,6 +58,10 @@
#define ENTRY_UNARY(N, T0, T1, U, F) \
ENTRY (N, unary, T0, T1, none, none, U, F)
+#undef ENTRY_UNARY_LANE
+#define ENTRY_UNARY_LANE(N, T0, T1, U, F) \
+ ENTRY (N, unary_lane, T0, T1, none, none, U, F)
+
#undef ENTRY_BINARY_VHSDF
#define ENTRY_BINARY_VHSDF(NAME, UNSPEC, FLAGS) \
ENTRY_BINARY (NAME##_f16, f16, f16, f16, UNSPEC, FLAGS) \
@@ -121,6 +145,7 @@ ENTRY_BINARY_VHSDF (vamin, UNSPEC_FAMIN, FP)
ENTRY_TERNARY_VLUT8 (p)
ENTRY_TERNARY_VLUT8 (s)
ENTRY_TERNARY_VLUT8 (u)
+ENTRY_TERNARY_VLUT8 (mf)
ENTRY_TERNARY_VLUT16 (bf)
ENTRY_TERNARY_VLUT16 (f)
@@ -170,3 +195,224 @@ ENTRY_FMA_FPM (vmlallbt, f32, UNSPEC_FMLALLBT_FP8)
ENTRY_FMA_FPM (vmlalltb, f32, UNSPEC_FMLALLTB_FP8)
ENTRY_FMA_FPM (vmlalltt, f32, UNSPEC_FMLALLTT_FP8)
#undef REQUIRED_EXTENSIONS
+
+// bsl
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_TERNARY (vbsl_mf8, mf8, u8, mf8, mf8, UNSPEC_BSL, QUIET)
+ENTRY_TERNARY (vbslq_mf8, mf8q, u8q, mf8q, mf8q, UNSPEC_BSL, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// combine
+#define REQUIRED_EXTENSIONS nonstreaming_only (NONE)
+ENTRY_BINARY (vcombine_mf8, mf8q, mf8, mf8, UNSPEC_COMBINE, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// copy_lane
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_BINARY_TWO_LANES (vcopy_lane_mf8, mf8, mf8, mf8,
+ UNSPEC_VEC_COPY, QUIET)
+ENTRY_BINARY_TWO_LANES (vcopyq_lane_mf8, mf8q, mf8q, mf8,
+ UNSPEC_VEC_COPY, QUIET)
+ENTRY_BINARY_TWO_LANES (vcopy_laneq_mf8, mf8, mf8, mf8q,
+ UNSPEC_VEC_COPY, QUIET)
+ENTRY_BINARY_TWO_LANES (vcopyq_laneq_mf8, mf8q, mf8q, mf8q,
+ UNSPEC_VEC_COPY, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// create
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_UNARY (vcreate_mf8, mf8, u64_scalar, UNSPEC_VCREATE, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// dup
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_UNARY (vdup_n_mf8, mf8, mf8_scalar, UNSPEC_DUP, QUIET)
+ENTRY_UNARY (vdupq_n_mf8, mf8q, mf8_scalar, UNSPEC_DUP, QUIET)
+
+ENTRY_UNARY_LANE (vdup_lane_mf8, mf8, mf8, UNSPEC_DUP_LANE, QUIET)
+ENTRY_UNARY_LANE (vdupq_lane_mf8, mf8q, mf8, UNSPEC_DUP_LANE, QUIET)
+ENTRY_UNARY_LANE (vdup_laneq_mf8, mf8, mf8q, UNSPEC_DUP_LANE, QUIET)
+ENTRY_UNARY_LANE (vdupq_laneq_mf8, mf8q, mf8q, UNSPEC_DUP_LANE, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// dupb_lane
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_UNARY_LANE (vdupb_lane_mf8, mf8_scalar, mf8, UNSPEC_GET_LANE, QUIET)
+ENTRY_UNARY_LANE (vdupb_laneq_mf8, mf8_scalar, mf8q, UNSPEC_GET_LANE, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// ext
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_BINARY_LANE (vext_mf8, mf8, mf8, mf8, UNSPEC_EXT, QUIET)
+ENTRY_BINARY_LANE (vextq_mf8, mf8q, mf8q, mf8q, UNSPEC_EXT, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// ld1
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_LOAD (vld1_mf8, mf8, mf8_scalar_const_ptr, UNSPEC_LD1)
+ENTRY_LOAD (vld1q_mf8, mf8q, mf8_scalar_const_ptr, UNSPEC_LD1)
+ENTRY_LOAD (vld1_dup_mf8, mf8, mf8_scalar_const_ptr, UNSPEC_DUP)
+ENTRY_LOAD (vld1q_dup_mf8, mf8q, mf8_scalar_const_ptr, UNSPEC_DUP)
+
+ENTRY_LOAD_LANE (vld1_lane_mf8, mf8, mf8_scalar_const_ptr, mf8,
+ UNSPEC_SET_LANE)
+ENTRY_LOAD_LANE (vld1q_lane_mf8, mf8q, mf8_scalar_const_ptr, mf8q,
+ UNSPEC_SET_LANE)
+#undef REQUIRED_EXTENSIONS
+
+// ld<n>
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_LOAD (vld1_mf8_x2, mf8x2, mf8_scalar_const_ptr, UNSPEC_LD1x2)
+ENTRY_LOAD (vld1q_mf8_x2, mf8qx2, mf8_scalar_const_ptr, UNSPEC_LD1x2)
+ENTRY_LOAD (vld2_mf8, mf8x2, mf8_scalar_const_ptr, UNSPEC_LD2)
+ENTRY_LOAD (vld2q_mf8, mf8qx2, mf8_scalar_const_ptr, UNSPEC_LD2)
+ENTRY_LOAD (vld2_dup_mf8, mf8x2, mf8_scalar_const_ptr, UNSPEC_LD2_DUP)
+ENTRY_LOAD (vld2q_dup_mf8, mf8qx2, mf8_scalar_const_ptr, UNSPEC_LD2_DUP)
+ENTRY_LOAD_LANE (vld2_lane_mf8, mf8x2, mf8_scalar_const_ptr, mf8x2,
+ UNSPEC_LD2_LANE)
+ENTRY_LOAD_LANE (vld2q_lane_mf8, mf8qx2, mf8_scalar_const_ptr, mf8qx2,
+ UNSPEC_LD2_LANE)
+
+ENTRY_LOAD (vld1_mf8_x3, mf8x3, mf8_scalar_const_ptr, UNSPEC_LD1x3)
+ENTRY_LOAD (vld1q_mf8_x3, mf8qx3, mf8_scalar_const_ptr, UNSPEC_LD1x3)
+ENTRY_LOAD (vld3_mf8, mf8x3, mf8_scalar_const_ptr, UNSPEC_LD3)
+ENTRY_LOAD (vld3q_mf8, mf8qx3, mf8_scalar_const_ptr, UNSPEC_LD3)
+ENTRY_LOAD (vld3_dup_mf8, mf8x3, mf8_scalar_const_ptr, UNSPEC_LD3_DUP)
+ENTRY_LOAD (vld3q_dup_mf8, mf8qx3, mf8_scalar_const_ptr, UNSPEC_LD3_DUP)
+ENTRY_LOAD_LANE (vld3_lane_mf8, mf8x3, mf8_scalar_const_ptr, mf8x3,
+ UNSPEC_LD3_LANE)
+ENTRY_LOAD_LANE (vld3q_lane_mf8, mf8qx3, mf8_scalar_const_ptr, mf8qx3,
+ UNSPEC_LD3_LANE)
+
+ENTRY_LOAD (vld1_mf8_x4, mf8x4, mf8_scalar_const_ptr, UNSPEC_LD1x4)
+ENTRY_LOAD (vld1q_mf8_x4, mf8qx4, mf8_scalar_const_ptr, UNSPEC_LD1x4)
+ENTRY_LOAD (vld4_mf8, mf8x4, mf8_scalar_const_ptr, UNSPEC_LD4)
+ENTRY_LOAD (vld4q_mf8, mf8qx4, mf8_scalar_const_ptr, UNSPEC_LD4)
+ENTRY_LOAD (vld4_dup_mf8, mf8x4, mf8_scalar_const_ptr, UNSPEC_LD4_DUP)
+ENTRY_LOAD (vld4q_dup_mf8, mf8qx4, mf8_scalar_const_ptr, UNSPEC_LD4_DUP)
+ENTRY_LOAD_LANE (vld4_lane_mf8, mf8x4, mf8_scalar_const_ptr, mf8x4,
+ UNSPEC_LD4_LANE)
+ENTRY_LOAD_LANE (vld4q_lane_mf8, mf8qx4, mf8_scalar_const_ptr, mf8qx4,
+ UNSPEC_LD4_LANE)
+#undef REQUIRED_EXTENSIONS
+
+// mov
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_UNARY (vmov_n_mf8, mf8, mf8_scalar, UNSPEC_DUP, QUIET)
+ENTRY_UNARY (vmovq_n_mf8, mf8q, mf8_scalar, UNSPEC_DUP, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// rev
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_UNARY (vrev64_mf8, mf8, mf8, UNSPEC_REV64, QUIET)
+ENTRY_UNARY (vrev64q_mf8, mf8q, mf8q, UNSPEC_REV64, QUIET)
+
+ENTRY_UNARY (vrev32_mf8, mf8, mf8, UNSPEC_REV32, QUIET)
+ENTRY_UNARY (vrev32q_mf8, mf8q, mf8q, UNSPEC_REV32, QUIET)
+
+ENTRY_UNARY (vrev16_mf8, mf8, mf8, UNSPEC_REV16, QUIET)
+ENTRY_UNARY (vrev16q_mf8, mf8q, mf8q, UNSPEC_REV16, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// set_lane
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_BINARY_LANE (vset_lane_mf8, mf8, mf8_scalar, mf8, UNSPEC_SET_LANE, QUIET)
+ENTRY_BINARY_LANE (vsetq_lane_mf8, mf8q, mf8_scalar, mf8q, UNSPEC_SET_LANE, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// st1
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_STORE (vst1_mf8, mf8_scalar_ptr, mf8, UNSPEC_ST1)
+ENTRY_STORE (vst1q_mf8, mf8_scalar_ptr, mf8q, UNSPEC_ST1)
+
+ENTRY_STORE_LANE (vst1_lane_mf8, mf8_scalar_ptr, mf8, UNSPEC_ST1_LANE)
+ENTRY_STORE_LANE (vst1q_lane_mf8, mf8_scalar_ptr, mf8q, UNSPEC_ST1_LANE)
+#undef REQUIRED_EXTENSIONS
+
+// st<n>
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_STORE (vst2_mf8, mf8_scalar_ptr, mf8x2, UNSPEC_ST2)
+ENTRY_STORE (vst2q_mf8, mf8_scalar_ptr, mf8qx2, UNSPEC_ST2)
+ENTRY_STORE (vst1_mf8_x2, mf8_scalar_ptr, mf8x2, UNSPEC_ST1x2)
+ENTRY_STORE (vst1q_mf8_x2, mf8_scalar_ptr, mf8qx2, UNSPEC_ST1x2)
+ENTRY_STORE_LANE (vst2_lane_mf8, mf8_scalar_ptr, mf8x2, UNSPEC_ST2_LANE)
+ENTRY_STORE_LANE (vst2q_lane_mf8, mf8_scalar_ptr, mf8qx2, UNSPEC_ST2_LANE)
+
+ENTRY_STORE (vst3_mf8, mf8_scalar_ptr, mf8x3, UNSPEC_ST3)
+ENTRY_STORE (vst3q_mf8, mf8_scalar_ptr, mf8qx3, UNSPEC_ST3)
+ENTRY_STORE (vst1_mf8_x3, mf8_scalar_ptr, mf8x3, UNSPEC_ST1x3)
+ENTRY_STORE (vst1q_mf8_x3, mf8_scalar_ptr, mf8qx3, UNSPEC_ST1x3)
+ENTRY_STORE_LANE (vst3_lane_mf8, mf8_scalar_ptr, mf8x3, UNSPEC_ST3_LANE)
+ENTRY_STORE_LANE (vst3q_lane_mf8, mf8_scalar_ptr, mf8qx3, UNSPEC_ST3_LANE)
+
+ENTRY_STORE (vst4_mf8, mf8_scalar_ptr, mf8x4, UNSPEC_ST4)
+ENTRY_STORE (vst4q_mf8, mf8_scalar_ptr, mf8qx4, UNSPEC_ST4)
+ENTRY_STORE (vst1_mf8_x4, mf8_scalar_ptr, mf8x4, UNSPEC_ST1x4)
+ENTRY_STORE (vst1q_mf8_x4, mf8_scalar_ptr, mf8qx4, UNSPEC_ST1x4)
+ENTRY_STORE_LANE (vst4_lane_mf8, mf8_scalar_ptr, mf8x4, UNSPEC_ST4_LANE)
+ENTRY_STORE_LANE (vst4q_lane_mf8, mf8_scalar_ptr, mf8qx4, UNSPEC_ST4_LANE)
+#undef REQUIRED_EXTENSIONS
+
+// tbl<n>
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_BINARY (vtbl1_mf8, mf8, mf8, u8, UNSPEC_TBL, QUIET)
+ENTRY_BINARY (vtbl2_mf8, mf8, mf8x2, u8, UNSPEC_TBL, QUIET)
+ENTRY_BINARY (vtbl3_mf8, mf8, mf8x3, u8, UNSPEC_TBL, QUIET)
+ENTRY_BINARY (vtbl4_mf8, mf8, mf8x4, u8, UNSPEC_TBL, QUIET)
+
+ENTRY_BINARY (vqtbl1_mf8, mf8, mf8q, u8, UNSPEC_TBL, QUIET)
+ENTRY_BINARY (vqtbl1q_mf8, mf8q, mf8q, u8q, UNSPEC_TBL, QUIET)
+ENTRY_BINARY (vqtbl2_mf8, mf8, mf8qx2, u8, UNSPEC_TBL, QUIET)
+ENTRY_BINARY (vqtbl2q_mf8, mf8q, mf8qx2, u8q, UNSPEC_TBL, QUIET)
+ENTRY_BINARY (vqtbl3_mf8, mf8, mf8qx3, u8, UNSPEC_TBL, QUIET)
+ENTRY_BINARY (vqtbl3q_mf8, mf8q, mf8qx3, u8q, UNSPEC_TBL, QUIET)
+ENTRY_BINARY (vqtbl4_mf8, mf8, mf8qx4, u8, UNSPEC_TBL, QUIET)
+ENTRY_BINARY (vqtbl4q_mf8, mf8q, mf8qx4, u8q, UNSPEC_TBL, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// tbx<n>
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_TERNARY (vtbx1_mf8, mf8, mf8, mf8, u8, UNSPEC_TBX, QUIET)
+ENTRY_TERNARY (vtbx2_mf8, mf8, mf8, mf8x2, u8, UNSPEC_TBX, QUIET)
+ENTRY_TERNARY (vtbx3_mf8, mf8, mf8, mf8x3, u8, UNSPEC_TBX, QUIET)
+ENTRY_TERNARY (vtbx4_mf8, mf8, mf8, mf8x4, u8, UNSPEC_TBX, QUIET)
+
+ENTRY_TERNARY (vqtbx1_mf8, mf8, mf8, mf8q, u8, UNSPEC_TBX, QUIET)
+ENTRY_TERNARY (vqtbx1q_mf8, mf8q, mf8q, mf8q, u8q, UNSPEC_TBX, QUIET)
+ENTRY_TERNARY (vqtbx2_mf8, mf8, mf8, mf8qx2, u8, UNSPEC_TBX, QUIET)
+ENTRY_TERNARY (vqtbx2q_mf8, mf8q, mf8q, mf8qx2, u8q, UNSPEC_TBX, QUIET)
+ENTRY_TERNARY (vqtbx3_mf8, mf8, mf8, mf8qx3, u8, UNSPEC_TBX, QUIET)
+ENTRY_TERNARY (vqtbx3q_mf8, mf8q, mf8q, mf8qx3, u8q, UNSPEC_TBX, QUIET)
+ENTRY_TERNARY (vqtbx4_mf8, mf8, mf8, mf8qx4, u8, UNSPEC_TBX, QUIET)
+ENTRY_TERNARY (vqtbx4q_mf8, mf8q, mf8q, mf8qx4, u8q, UNSPEC_TBX, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// trn<n>
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_BINARY (vtrn1_mf8, mf8, mf8, mf8, UNSPEC_TRN1, QUIET)
+ENTRY_BINARY (vtrn1q_mf8, mf8q, mf8q, mf8q, UNSPEC_TRN1, QUIET)
+ENTRY_BINARY (vtrn2_mf8, mf8, mf8, mf8, UNSPEC_TRN2, QUIET)
+ENTRY_BINARY (vtrn2q_mf8, mf8q, mf8q, mf8q, UNSPEC_TRN2, QUIET)
+ENTRY_BINARY (vtrn_mf8, mf8x2, mf8, mf8, UNSPEC_TRN, QUIET)
+ENTRY_BINARY (vtrnq_mf8, mf8qx2, mf8q, mf8q, UNSPEC_TRN, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// uzp<n>
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_BINARY (vuzp1_mf8, mf8, mf8, mf8, UNSPEC_UZP1, QUIET)
+ENTRY_BINARY (vuzp1q_mf8, mf8q, mf8q, mf8q, UNSPEC_UZP1, QUIET)
+ENTRY_BINARY (vuzp2_mf8, mf8, mf8, mf8, UNSPEC_UZP2, QUIET)
+ENTRY_BINARY (vuzp2q_mf8, mf8q, mf8q, mf8q, UNSPEC_UZP2, QUIET)
+ENTRY_BINARY (vuzp_mf8, mf8x2, mf8, mf8, UNSPEC_UZP, QUIET)
+ENTRY_BINARY (vuzpq_mf8, mf8qx2, mf8q, mf8q, UNSPEC_UZP, QUIET)
+#undef REQUIRED_EXTENSIONS
+
+// zip<n>
+#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD)
+ENTRY_BINARY (vzip1_mf8, mf8, mf8, mf8, UNSPEC_ZIP1, QUIET)
+ENTRY_BINARY (vzip1q_mf8, mf8q, mf8q, mf8q, UNSPEC_ZIP1, QUIET)
+ENTRY_BINARY (vzip2_mf8, mf8, mf8, mf8, UNSPEC_ZIP2, QUIET)
+ENTRY_BINARY (vzip2q_mf8, mf8q, mf8q, mf8q, UNSPEC_ZIP2, QUIET)
+ENTRY_BINARY (vzip_mf8, mf8x2, mf8, mf8, UNSPEC_ZIP, QUIET)
+ENTRY_BINARY (vzipq_mf8, mf8qx2, mf8q, mf8q, UNSPEC_ZIP, QUIET)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7959cca..237de1b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -112,7 +112,7 @@
}
)
-(define_insn "aarch64_dup_lane<mode>"
+(define_insn "@aarch64_dup_lane<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(vec_duplicate:VALL_F16
(vec_select:<VEL>
@@ -127,7 +127,7 @@
[(set_attr "type" "neon_dup<q>")]
)
-(define_insn "aarch64_dup_lane_<vswap_width_name><mode>"
+(define_insn "@aarch64_dup_lane_<vswap_width_name><mode>"
[(set (match_operand:VALL_F16_NO_V2Q 0 "register_operand" "=w")
(vec_duplicate:VALL_F16_NO_V2Q
(vec_select:<VEL>
@@ -1164,7 +1164,7 @@
[(set_attr "type" "neon_logic<q>")]
)
-(define_insn "aarch64_simd_vec_set<mode>"
+(define_insn "@aarch64_simd_vec_set<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
(vec_merge:VALL_F16
(vec_duplicate:VALL_F16
@@ -1225,7 +1225,7 @@
[(set_attr "type" "neon_ins<q>")]
)
-(define_insn "*aarch64_simd_vec_copy_lane_<vswap_width_name><mode>"
+(define_insn "@aarch64_simd_vec_copy_lane_<vswap_width_name><mode>"
[(set (match_operand:VALL_F16_NO_V2Q 0 "register_operand" "=w")
(vec_merge:VALL_F16_NO_V2Q
(vec_duplicate:VALL_F16_NO_V2Q
@@ -3837,7 +3837,7 @@
}
)
-(define_expand "aarch64_simd_bsl<mode>"
+(define_expand "@aarch64_simd_bsl<mode>"
[(match_operand:VALLDIF 0 "register_operand")
(match_operand:<V_INT_EQUIV> 1 "register_operand")
(match_operand:VALLDIF 2 "register_operand")
@@ -4438,7 +4438,7 @@
;; Form a vector whose least significant half comes from operand 1 and whose
;; most significant half comes from operand 2. This operand order follows
;; arm_neon.h vcombine* intrinsics.
-(define_expand "aarch64_combine<mode>"
+(define_expand "@aarch64_combine<mode>"
[(match_operand:<VDBL> 0 "register_operand")
(match_operand:VDC 1 "general_operand")
(match_operand:VDC 2 "general_operand")]
@@ -6971,7 +6971,7 @@
;; Note, we have constraints for Dz and Z as different expanders
;; have different ideas of what should be passed to this pattern.
-(define_insn "aarch64_cm<optab><mode><vczle><vczbe>"
+(define_insn "@aarch64_cm<optab><mode><vczle><vczbe>"
[(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
(neg:<V_INT_EQUIV>
(COMPARISONS:<V_INT_EQUIV>
@@ -7036,7 +7036,7 @@
;; cm(hs|hi)
-(define_insn "aarch64_cm<optab><mode><vczle><vczbe>"
+(define_insn "@aarch64_cm<optab><mode><vczle><vczbe>"
[(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
(neg:<V_INT_EQUIV>
(UCOMPARISONS:<V_INT_EQUIV>
@@ -7188,7 +7188,7 @@
;; fcm(eq|ge|gt|le|lt)
-(define_insn "aarch64_cm<optab><mode><vczle><vczbe>"
+(define_insn "@aarch64_cm<optab><mode><vczle><vczbe>"
[(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
(neg:<V_INT_EQUIV>
(COMPARISONS:<V_INT_EQUIV>
@@ -7349,7 +7349,7 @@
[(set_attr "type" "neon_load2_2reg<q>")]
)
-(define_insn "aarch64_simd_ld2r<vstruct_elt>"
+(define_insn "@aarch64_simd_ld2r<vstruct_elt>"
[(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
(unspec:VSTRUCT_2QD [
(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
@@ -7359,7 +7359,7 @@
[(set_attr "type" "neon_load2_all_lanes<q>")]
)
-(define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
+(define_insn "@aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
[(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
(unspec:VSTRUCT_2QD [
(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
@@ -7449,7 +7449,7 @@
[(set_attr "type" "neon_load3_3reg<q>")]
)
-(define_insn "aarch64_simd_ld3r<vstruct_elt>"
+(define_insn "@aarch64_simd_ld3r<vstruct_elt>"
[(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
(unspec:VSTRUCT_3QD [
(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
@@ -7549,7 +7549,7 @@
[(set_attr "type" "neon_load4_4reg<q>")]
)
-(define_insn "aarch64_simd_ld4r<vstruct_elt>"
+(define_insn "@aarch64_simd_ld4r<vstruct_elt>"
[(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
(unspec:VSTRUCT_4QD [
(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
@@ -7773,7 +7773,7 @@
operands[1] = force_reg (V8DImode, operands[1]);
})
-(define_expand "aarch64_ld1x3<vstruct_elt>"
+(define_expand "@aarch64_ld1x3<vstruct_elt>"
[(match_operand:VSTRUCT_3QD 0 "register_operand")
(match_operand:DI 1 "register_operand")]
"TARGET_SIMD"
@@ -7793,7 +7793,7 @@
[(set_attr "type" "neon_load1_3reg<q>")]
)
-(define_expand "aarch64_ld1x4<vstruct_elt>"
+(define_expand "@aarch64_ld1x4<vstruct_elt>"
[(match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
(match_operand:DI 1 "register_operand" "r")]
"TARGET_SIMD"
@@ -7813,7 +7813,7 @@
[(set_attr "type" "neon_load1_4reg<q>")]
)
-(define_expand "aarch64_st1x2<vstruct_elt>"
+(define_expand "@aarch64_st1x2<vstruct_elt>"
[(match_operand:DI 0 "register_operand")
(match_operand:VSTRUCT_2QD 1 "register_operand")]
"TARGET_SIMD"
@@ -7833,7 +7833,7 @@
[(set_attr "type" "neon_store1_2reg<q>")]
)
-(define_expand "aarch64_st1x3<vstruct_elt>"
+(define_expand "@aarch64_st1x3<vstruct_elt>"
[(match_operand:DI 0 "register_operand")
(match_operand:VSTRUCT_3QD 1 "register_operand")]
"TARGET_SIMD"
@@ -7853,7 +7853,7 @@
[(set_attr "type" "neon_store1_3reg<q>")]
)
-(define_expand "aarch64_st1x4<vstruct_elt>"
+(define_expand "@aarch64_st1x4<vstruct_elt>"
[(match_operand:DI 0 "register_operand" "")
(match_operand:VSTRUCT_4QD 1 "register_operand" "")]
"TARGET_SIMD"
@@ -8220,7 +8220,7 @@
[(set_attr "type" "neon_load1_4reg<q>")]
)
-(define_expand "aarch64_ld<nregs><vstruct_elt>"
+(define_expand "@aarch64_ld<nregs><vstruct_elt>"
[(match_operand:VSTRUCT_D 0 "register_operand")
(match_operand:DI 1 "register_operand")]
"TARGET_SIMD"
@@ -8230,7 +8230,7 @@
DONE;
})
-(define_expand "aarch64_ld1<VALL_F16:mode>"
+(define_expand "@aarch64_ld1<VALL_F16:mode>"
[(match_operand:VALL_F16 0 "register_operand")
(match_operand:DI 1 "register_operand")]
"TARGET_SIMD"
@@ -8245,7 +8245,7 @@
DONE;
})
-(define_expand "aarch64_ld<nregs><vstruct_elt>"
+(define_expand "@aarch64_ld<nregs><vstruct_elt>"
[(match_operand:VSTRUCT_Q 0 "register_operand")
(match_operand:DI 1 "register_operand")]
"TARGET_SIMD"
@@ -8255,7 +8255,7 @@
DONE;
})
-(define_expand "aarch64_ld1x2<vstruct_elt>"
+(define_expand "@aarch64_ld1x2<vstruct_elt>"
[(match_operand:VSTRUCT_2QD 0 "register_operand")
(match_operand:DI 1 "register_operand")]
"TARGET_SIMD"
@@ -8267,7 +8267,7 @@
DONE;
})
-(define_expand "aarch64_ld<nregs>_lane<vstruct_elt>"
+(define_expand "@aarch64_ld<nregs>_lane<vstruct_elt>"
[(match_operand:VSTRUCT_QD 0 "register_operand")
(match_operand:DI 1 "register_operand")
(match_operand:VSTRUCT_QD 2 "register_operand")
@@ -8411,7 +8411,7 @@
;; This instruction's pattern is generated directly by
;; aarch64_expand_vec_perm_const, so any changes to the pattern would
;; need corresponding changes there.
-(define_insn "aarch64_<PERMUTE:perm_insn><mode><vczle><vczbe>"
+(define_insn "@aarch64_<PERMUTE:perm_insn><mode><vczle><vczbe>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
(match_operand:VALL_F16 2 "register_operand" "w")]
@@ -8437,7 +8437,7 @@
;; aarch64_expand_vec_perm_const, so any changes to the pattern would
;; need corresponding changes there. Note that the immediate (third)
;; operand is a lane index not a byte index.
-(define_insn "aarch64_ext<mode>"
+(define_insn "@aarch64_ext<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
(match_operand:VALL_F16 2 "register_operand" "w")
@@ -8455,7 +8455,7 @@
;; This instruction's pattern is generated directly by
;; aarch64_expand_vec_perm_const, so any changes to the pattern would
;; need corresponding changes there.
-(define_insn "aarch64_rev<REVERSE:rev_op><mode><vczle><vczbe>"
+(define_insn "@aarch64_rev<REVERSE:rev_op><mode><vczle><vczbe>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")]
REVERSE))]
@@ -8524,7 +8524,7 @@
[(set_attr "type" "neon_store1_4reg")]
)
-(define_expand "aarch64_st<nregs><vstruct_elt>"
+(define_expand "@aarch64_st<nregs><vstruct_elt>"
[(match_operand:DI 0 "register_operand")
(match_operand:VSTRUCT_D 1 "register_operand")]
"TARGET_SIMD"
@@ -8534,7 +8534,7 @@
DONE;
})
-(define_expand "aarch64_st<nregs><vstruct_elt>"
+(define_expand "@aarch64_st<nregs><vstruct_elt>"
[(match_operand:DI 0 "register_operand")
(match_operand:VSTRUCT_Q 1 "register_operand")]
"TARGET_SIMD"
@@ -8544,7 +8544,7 @@
DONE;
})
-(define_expand "aarch64_st<nregs>_lane<vstruct_elt>"
+(define_expand "@aarch64_st<nregs>_lane<vstruct_elt>"
[(match_operand:DI 0 "register_operand")
(match_operand:VSTRUCT_QD 1 "register_operand")
(match_operand:SI 2 "immediate_operand")]
@@ -8560,7 +8560,7 @@
DONE;
})
-(define_expand "aarch64_st1<VALL_F16:mode>"
+(define_expand "@aarch64_st1<VALL_F16:mode>"
[(match_operand:DI 0 "register_operand")
(match_operand:VALL_F16 1 "register_operand")]
"TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 41cc2ee..6bb4bdf 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1802,7 +1802,7 @@ aarch64_ldn_stn_vectors (machine_mode mode)
/* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
corresponding vector structure mode. */
-static opt_machine_mode
+opt_machine_mode
aarch64_advsimd_vector_array_mode (machine_mode mode,
unsigned HOST_WIDE_INT nelems)
{
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index edac1ae..c62de38 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -198,8 +198,10 @@
UNSPEC_AUTIB1716
UNSPEC_AUTIASP
UNSPEC_AUTIBSP
+ UNSPEC_BSL
UNSPEC_CALLEE_ABI
UNSPEC_CASESI
+ UNSPEC_COMBINE
UNSPEC_CPYMEM
UNSPEC_CRC32B
UNSPEC_CRC32CB
@@ -209,6 +211,8 @@
UNSPEC_CRC32H
UNSPEC_CRC32W
UNSPEC_CRC32X
+ UNSPEC_DUP
+ UNSPEC_DUP_LANE
UNSPEC_FCVTZS
UNSPEC_FCVTZU
UNSPEC_FJCVTZS
@@ -227,6 +231,7 @@
UNSPEC_FRINTP
UNSPEC_FRINTX
UNSPEC_FRINTZ
+ UNSPEC_GET_LANE
UNSPEC_GOTSMALLPIC
UNSPEC_GOTSMALLPIC28K
UNSPEC_GOTSMALLTLS
@@ -236,6 +241,10 @@
UNSPEC_LDP_FST
UNSPEC_LDP_SND
UNSPEC_LD1
+ UNSPEC_LD1_DUP
+ UNSPEC_LD1x2
+ UNSPEC_LD1x3
+ UNSPEC_LD1x4
UNSPEC_LD2
UNSPEC_LD2_DREG
UNSPEC_LD2_DUP
@@ -265,12 +274,17 @@
UNSPEC_REV
UNSPEC_SADALP
UNSPEC_SCVTF
+ UNSPEC_SET_LANE
UNSPEC_SETMEM
UNSPEC_SISD_NEG
UNSPEC_SISD_SSHL
UNSPEC_SISD_USHL
UNSPEC_SSHL_2S
UNSPEC_ST1
+ UNSPEC_ST1_LANE
+ UNSPEC_ST1x2
+ UNSPEC_ST1x3
+ UNSPEC_ST1x4
UNSPEC_ST2
UNSPEC_ST3
UNSPEC_ST4
@@ -314,6 +328,8 @@
UNSPEC_UNPACKSLO
UNSPEC_UNPACKULO
UNSPEC_PACK
+ UNSPEC_VCREATE
+ UNSPEC_VEC_COPY
UNSPEC_WHILEGE
UNSPEC_WHILEGT
UNSPEC_WHILEHI
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 34200b0..07b9754 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1095,6 +1095,7 @@
UNSPEC_SUBHNB ; Used in aarch64-sve2.md.
UNSPEC_SUBHNT ; Used in aarch64-sve2.md.
UNSPEC_TBL2 ; Used in aarch64-sve2.md.
+ UNSPEC_TRN ; Used in aarch64-builtins.cc.
UNSPEC_UABDLB ; Used in aarch64-sve2.md.
UNSPEC_UABDLT ; Used in aarch64-sve2.md.
UNSPEC_UADDLB ; Used in aarch64-sve2.md.