aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSaurabh Jha <saurabh.jha@arm.com>2024-12-03 09:54:01 +0000
committerRichard Sandiford <richard.sandiford@arm.com>2024-12-03 09:54:01 +0000
commita07a2b8c9e7c2d123f0178875c9110eaf9770b7a (patch)
treeb7cf5b335a3069e2293051f403742045370c056c
parentf855bc3006f43a623bd6197a795f238e70a3f007 (diff)
downloadgcc-a07a2b8c9e7c2d123f0178875c9110eaf9770b7a.zip
gcc-a07a2b8c9e7c2d123f0178875c9110eaf9770b7a.tar.gz
gcc-a07a2b8c9e7c2d123f0178875c9110eaf9770b7a.tar.bz2
aarch64: Add support for AdvSIMD lut
The AArch64 FEAT_LUT extension is optional from Armv9.2-A and mandatory
from Armv9.5-A.  It introduces instructions for lookup table reads with
bit indices.  This patch adds support for AdvSIMD lut intrinsics.

The intrinsics for this extension are implemented as the following
builtin functions:
* vluti2{q}_lane{q}_{u8|s8|p8}
* vluti2{q}_lane{q}_{u16|s16|p16|f16|bf16}
* vluti4q_lane{q}_{u8|s8|p8}
* vluti4q_lane{q}_{u16|s16|p16|f16|bf16}_x2

We also introduced a new approach to do lane checks for AdvSIMD.

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.cc
	(aarch64_builtin_signatures): Add binary_lane.
	(aarch64_fntype): Handle it.
	(simd_types): Add 16-bit x2 types.
	(aarch64_pragma_builtins_checker): New class.
	(aarch64_general_check_builtin_call): Use it.
	(aarch64_expand_pragma_builtin): Add support for lut unspecs.
	* config/aarch64/aarch64-option-extensions.def
	(AARCH64_OPT_EXTENSION): Add lut option.
	* config/aarch64/aarch64-simd-pragma-builtins.def
	(ENTRY_BINARY_LANE): Modify to use new ENTRY macro.
	(ENTRY_TERNARY_VLUT8): Macro to declare lut intrinsics.
	(ENTRY_TERNARY_VLUT16): Macro to declare lut intrinsics.
	(REQUIRED_EXTENSIONS): Declare lut intrinsics.
	* config/aarch64/aarch64-simd.md
	(@aarch64_<vluti_uns_op><VLUT:mode><VB:mode>): Instruction pattern
	for luti2 and luti4 intrinsics.
	(@aarch64_lutx2<VLUT:mode><VB:mode>): Instruction pattern for
	luti4x2 intrinsics.
	* config/aarch64/aarch64.h (TARGET_LUT): lut flag.
	* config/aarch64/iterators.md: Iterators and attributes for lut.
	* doc/invoke.texi: Document extension in AArch64 Options.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/lut-incorrect-range.c: New test.
	* gcc.target/aarch64/simd/lut-no-flag.c: New test.
	* gcc.target/aarch64/simd/lut.c: New test.

Co-authored-by: Vladimir Miloserdov <vladimir.miloserdov@arm.com>
Co-authored-by: Richard Sandiford <richard.sandiford@arm.com>
-rw-r--r--gcc/config/aarch64/aarch64-builtins.cc132
-rw-r--r--gcc/config/aarch64/aarch64-option-extensions.def2
-rw-r--r--gcc/config/aarch64/aarch64-simd-pragma-builtins.def38
-rw-r--r--gcc/config/aarch64/aarch64-simd.md25
-rw-r--r--gcc/config/aarch64/aarch64.h3
-rw-r--r--gcc/config/aarch64/iterators.md14
-rw-r--r--gcc/doc/invoke.texi2
-rw-r--r--gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c221
-rw-r--r--gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c10
-rw-r--r--gcc/testsuite/gcc.target/aarch64/simd/lut.c849
10 files changed, 1294 insertions, 2 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 8984f0c..f8c8a27 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -50,6 +50,8 @@
#include "builtins.h"
#include "aarch64-builtins.h"
+using namespace aarch64;
+
#define v8qi_UP E_V8QImode
#define v8di_UP E_V8DImode
#define v4hi_UP E_V4HImode
@@ -1600,6 +1602,7 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma)
enum class aarch64_builtin_signatures
{
binary,
+ binary_lane,
};
namespace {
@@ -1623,15 +1626,20 @@ namespace simd_types {
constexpr simd_type f16 { V4HFmode, qualifier_none };
constexpr simd_type f16q { V8HFmode, qualifier_none };
+ constexpr simd_type f16qx2 { V2x8HFmode, qualifier_none };
constexpr simd_type p16 { V4HImode, qualifier_poly };
constexpr simd_type p16q { V8HImode, qualifier_poly };
+ constexpr simd_type p16qx2 { V2x8HImode, qualifier_poly };
constexpr simd_type s16 { V4HImode, qualifier_none };
constexpr simd_type s16q { V8HImode, qualifier_none };
+ constexpr simd_type s16qx2 { V2x8HImode, qualifier_none };
constexpr simd_type u16 { V4HImode, qualifier_unsigned };
constexpr simd_type u16q { V8HImode, qualifier_unsigned };
+ constexpr simd_type u16qx2 { V2x8HImode, qualifier_unsigned };
constexpr simd_type bf16 { V4BFmode, qualifier_none };
constexpr simd_type bf16q { V8BFmode, qualifier_none };
+ constexpr simd_type bf16qx2 { V2x8BFmode, qualifier_none };
constexpr simd_type f32 { V2SFmode, qualifier_none };
constexpr simd_type f32q { V4SFmode, qualifier_none };
@@ -1671,11 +1679,21 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
switch (builtin_data.signature)
{
case aarch64_builtin_signatures::binary:
+ case aarch64_builtin_signatures::binary_lane:
return_type = builtin_data.types[0].type ();
for (int i = 1; i <= 2; ++i)
arg_types.quick_push (builtin_data.types[i].type ());
break;
}
+ switch (builtin_data.signature)
+ {
+ case aarch64_builtin_signatures::binary_lane:
+ arg_types.quick_push (integer_type_node);
+ break;
+
+ default:
+ break;
+ }
return build_function_type_array (return_type, arg_types.length (),
arg_types.address ());
}
@@ -2522,17 +2540,109 @@ aarch64_general_required_extensions (unsigned int code)
return ext::streaming_compatible (0);
}
+/* Checks calls to intrinsics that are defined using
+ aarch64-simd-pragma-builtins.def. */
+struct aarch64_pragma_builtins_checker
+{
+ aarch64_pragma_builtins_checker (location_t, tree, unsigned int, tree *,
+ const aarch64_pragma_builtins_data &);
+
+ bool require_immediate_range (unsigned int, HOST_WIDE_INT,
+ HOST_WIDE_INT);
+
+ bool check ();
+
+ location_t location;
+ tree fndecl;
+ unsigned int nargs;
+ array_slice<tree> args;
+ const aarch64_pragma_builtins_data &builtin_data;
+};
+
+/* LOCATION is the location of the call; FNDECL is the FUNCTION_DECL
+ that is being called; NARGS is the number of arguments to the call,
+ which are in a vector starting at FIRST_ARG; and BUILTIN_DATA describes
+ the intrinsic. */
+aarch64_pragma_builtins_checker::
+aarch64_pragma_builtins_checker (location_t location, tree fndecl,
+ unsigned int nargs, tree *first_arg,
+ const aarch64_pragma_builtins_data
+ &builtin_data)
+ : location (location), fndecl (fndecl), nargs (nargs),
+ args (first_arg, nargs), builtin_data (builtin_data)
+{
+}
+
+/* Require argument ARGNO to be an integer constant expression in the
+ range [MIN, MAX]. Return true if it was. */
+bool
+aarch64_pragma_builtins_checker::
+require_immediate_range (unsigned int argno, HOST_WIDE_INT min,
+ HOST_WIDE_INT max)
+{
+ if (!tree_fits_shwi_p (args[argno]))
+ {
+ report_non_ice (location, fndecl, argno);
+ return false;
+ }
+
+ HOST_WIDE_INT actual = tree_to_shwi (args[argno]);
+ if (actual < min || actual > max)
+ {
+ report_out_of_range (location, fndecl, argno, actual, min, max);
+ return false;
+ }
+
+ return true;
+}
+
+/* Check the arguments to the intrinsic call and return true if they
+ are valid. */
+bool
+aarch64_pragma_builtins_checker::check ()
+{
+ switch (builtin_data.unspec)
+ {
+ case UNSPEC_LUTI2:
+ case UNSPEC_LUTI4:
+ {
+ auto vector_to_index_mode = builtin_data.types[nargs - 1].mode;
+ int vector_to_index_nunits
+ = GET_MODE_NUNITS (vector_to_index_mode).to_constant ();
+ int output_mode_nunits
+ = GET_MODE_NUNITS (builtin_data.types[0].mode).to_constant ();
+
+ int high;
+ if (builtin_data.unspec == UNSPEC_LUTI2)
+ high = (4 * vector_to_index_nunits / output_mode_nunits) - 1;
+ else
+ high = (2 * vector_to_index_nunits / output_mode_nunits) - 1;
+
+ return require_immediate_range (nargs - 1, 0, high);
+ }
+
+ default:
+ return true;
+ }
+}
+
bool
aarch64_general_check_builtin_call (location_t location, vec<location_t>,
unsigned int code, tree fndecl,
- unsigned int nargs ATTRIBUTE_UNUSED,
- tree *args)
+ unsigned int nargs, tree *args)
{
tree decl = aarch64_builtin_decls[code];
auto required_extensions = aarch64_general_required_extensions (code);
if (!aarch64_check_required_extensions (location, decl, required_extensions))
return false;
+ if (auto builtin_data = aarch64_get_pragma_builtin (code))
+ {
+ aarch64_pragma_builtins_checker checker (location, fndecl, nargs, args,
+ *builtin_data);
+ return checker.check ();
+ }
+
switch (code)
{
case AARCH64_RSR:
@@ -3442,6 +3552,16 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
TYPE_MODE (TREE_TYPE (arg)));
}
+ /* LUTI2 treats the first argument as a vector of 4 elements. The forms
+ with 128-bit inputs are only provided as a convenience; the upper halves
+ don't actually matter. */
+ if (builtin_data.unspec == UNSPEC_LUTI2
+ && known_eq (GET_MODE_BITSIZE (ops[1].mode), 128u))
+ {
+ ops[1].mode = aarch64_v64_mode (GET_MODE_INNER (ops[1].mode)).require ();
+ ops[1].value = gen_lowpart (ops[1].mode, ops[1].value);
+ }
+
insn_code icode;
switch (builtin_data.unspec)
{
@@ -3450,6 +3570,14 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
icode = code_for_aarch64 (builtin_data.unspec,
builtin_data.types[0].mode);
break;
+
+ case UNSPEC_LUTI2:
+ case UNSPEC_LUTI4:
+ create_integer_operand (ops.safe_push ({}),
+ builtin_data.unspec == UNSPEC_LUTI2 ? 2 : 4);
+ icode = code_for_aarch64_lut (ops[1].mode, ops[2].mode);
+ break;
+
default:
gcc_unreachable ();
}
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index 90abb1c..0a61b48 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -259,6 +259,8 @@ AARCH64_OPT_EXTENSION("fp8dot2", FP8DOT2, (FP8DOT4), (), (), "fp8dot2")
AARCH64_OPT_EXTENSION("ssve-fp8dot2", SSVE_FP8DOT2, (SSVE_FP8DOT4), (), (), "ssve-fp8dot2")
+AARCH64_OPT_EXTENSION("lut", LUT, (SIMD), (), (), "lut")
+
#undef AARCH64_OPT_FMV_EXTENSION
#undef AARCH64_OPT_EXTENSION
#undef AARCH64_FMV_FEATURE
diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
index e49db23..db40745 100644
--- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
@@ -22,6 +22,10 @@
#define ENTRY_BINARY(N, T0, T1, T2, U) \
ENTRY (N, binary, T0, T1, T2, U)
+#undef ENTRY_BINARY_LANE
+#define ENTRY_BINARY_LANE(N, T0, T1, T2, U) \
+ ENTRY (N, binary_lane, T0, T1, T2, U)
+
#undef ENTRY_BINARY_VHSDF
#define ENTRY_BINARY_VHSDF(NAME, UNSPEC) \
ENTRY_BINARY (NAME##_f16, f16, f16, f16, UNSPEC) \
@@ -30,8 +34,42 @@
ENTRY_BINARY (NAME##q_f32, f32q, f32q, f32q, UNSPEC) \
ENTRY_BINARY (NAME##q_f64, f64q, f64q, f64q, UNSPEC)
+#undef ENTRY_TERNARY_VLUT8
+#define ENTRY_TERNARY_VLUT8(T) \
+ ENTRY_BINARY_LANE (vluti2_lane_##T##8, T##8q, T##8, u8, UNSPEC_LUTI2) \
+ ENTRY_BINARY_LANE (vluti2_laneq_##T##8, T##8q, T##8, u8q, UNSPEC_LUTI2) \
+ ENTRY_BINARY_LANE (vluti2q_lane_##T##8, T##8q, T##8q, u8, UNSPEC_LUTI2) \
+ ENTRY_BINARY_LANE (vluti2q_laneq_##T##8, T##8q, T##8q, u8q, UNSPEC_LUTI2) \
+ ENTRY_BINARY_LANE (vluti4q_lane_##T##8, T##8q, T##8q, u8, UNSPEC_LUTI4) \
+ ENTRY_BINARY_LANE (vluti4q_laneq_##T##8, T##8q, T##8q, u8q, UNSPEC_LUTI4)
+
+#undef ENTRY_TERNARY_VLUT16
+#define ENTRY_TERNARY_VLUT16(T) \
+ ENTRY_BINARY_LANE (vluti2_lane_##T##16, T##16q, T##16, u8, UNSPEC_LUTI2) \
+ ENTRY_BINARY_LANE (vluti2_laneq_##T##16, T##16q, T##16, u8q, UNSPEC_LUTI2) \
+ ENTRY_BINARY_LANE (vluti2q_lane_##T##16, T##16q, T##16q, u8, UNSPEC_LUTI2) \
+ ENTRY_BINARY_LANE (vluti2q_laneq_##T##16, T##16q, T##16q, u8q, \
+ UNSPEC_LUTI2) \
+ ENTRY_BINARY_LANE (vluti4q_lane_##T##16_x2, T##16q, T##16qx2, u8, \
+ UNSPEC_LUTI4) \
+ ENTRY_BINARY_LANE (vluti4q_laneq_##T##16_x2, T##16q, T##16qx2, u8q, \
+ UNSPEC_LUTI4)
+
// faminmax
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
ENTRY_BINARY_VHSDF (vamax, UNSPEC_FAMAX)
ENTRY_BINARY_VHSDF (vamin, UNSPEC_FAMIN)
#undef REQUIRED_EXTENSIONS
+
+// lut
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_LUT)
+ENTRY_TERNARY_VLUT8 (p)
+ENTRY_TERNARY_VLUT8 (s)
+ENTRY_TERNARY_VLUT8 (u)
+
+ENTRY_TERNARY_VLUT16 (bf)
+ENTRY_TERNARY_VLUT16 (f)
+ENTRY_TERNARY_VLUT16 (p)
+ENTRY_TERNARY_VLUT16 (s)
+ENTRY_TERNARY_VLUT16 (u)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cfe95bd..05cbd38 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -9999,3 +9999,28 @@
"TARGET_FAMINMAX"
"<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
)
+
+(define_insn "@aarch64_lut<VLUT:mode><VB:mode>"
+ [(set (match_operand:<VLUT:VCONQ> 0 "register_operand" "=w")
+ (unspec:<VLUT:VCONQ>
+ [(match_operand:VLUT 1 "register_operand" "w")
+ (match_operand:VB 2 "register_operand" "w")
+ (match_operand:SI 3 "const_int_operand")
+ (match_operand:SI 4 "const_int_operand")]
+ UNSPEC_LUTI))]
+ "TARGET_LUT && INTVAL (operands[4]) <= exact_log2 (<VLUT:nunits>)"
+ "luti%4\t%0<VLUT:Vconqtype>, {%1<VLUT:Vconqtype>}, %2[%3]"
+)
+
+;; lutx2
+(define_insn "@aarch64_lut<VLUTx2:mode><VB:mode>"
+ [(set (match_operand:<VSTRUCT_ELT> 0 "register_operand" "=w")
+ (unspec:<VSTRUCT_ELT>
+ [(match_operand:VLUTx2 1 "register_operand" "w")
+ (match_operand:VB 2 "register_operand" "w")
+ (match_operand:SI 3 "const_int_operand")
+ (match_operand:SI 4 "const_int_operand")]
+ UNSPEC_LUTI))]
+ "TARGET_LUT && INTVAL (operands[4]) == 4"
+ "luti%4\t%0.8h, {%S1.8h, %T1.8h}, %2[%3]"
+)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index c5dcbe1..b1c694e 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -487,6 +487,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
#define TARGET_FAMINMAX AARCH64_HAVE_ISA (FAMINMAX)
#define TARGET_SVE_FAMINMAX (TARGET_SVE && TARGET_FAMINMAX)
+/* Lookup table (LUTI) extension instructions are enabled through +lut. */
+#define TARGET_LUT AARCH64_HAVE_ISA (LUT)
+
/* Prefer different predicate registers for the output of a predicated
operation over re-using an existing input predicate. */
#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 720d79d..90725c7 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -426,6 +426,10 @@
(V8HF "TARGET_SIMD_F16INST")
V2SF V4SF])
+;; Modes available for Advanced SIMD lut operations.
+(define_mode_iterator VLUT [V8QI V16QI V4HI V4HF V4BF])
+(define_mode_iterator VLUTx2 [V2x8HI V2x8HF V2x8BF])
+
;; Iterators for single modes, for "@" patterns.
(define_mode_iterator VNx16QI_ONLY [VNx16QI])
(define_mode_iterator VNx16SI_ONLY [VNx16SI])
@@ -1109,6 +1113,9 @@
UNSPEC_FCVTXN ; Used in aarch64-simd.md.
UNSPEC_FAMAX ; Used in aarch64-simd.md.
UNSPEC_FAMIN ; Used in aarch64-simd.md.
+ UNSPEC_LUTI ; Used in aarch64-simd.md.
+ UNSPEC_LUTI2 ; Used in aarch64-simd.md.
+ UNSPEC_LUTI4 ; Used in aarch64-simd.md.
;; All used in aarch64-sve2.md
UNSPEC_ADDQV
@@ -1555,6 +1562,12 @@
(QI "8b") (HI "8b")
(V4BF "8b") (V8BF "16b")])
+;; Mode to double type mapping.
+(define_mode_attr Vconqtype [(V8QI ".16b") (V16QI ".16b")
+ (V4HI ".8h") (V8HI ".8h")
+ (V4HF ".8h") (V8HF ".8h")
+ (V4BF ".8h") (V8BF ".8h")])
+
;; Advanced SIMD vector structure to element modes.
(define_mode_attr VSTRUCT_ELT [(V2x8QI "V8QI") (V2x4HI "V4HI")
(V2x2SI "V2SI") (V2x1DI "DI")
@@ -1685,6 +1698,7 @@
(V2SI "V4SI") (V4SI "V4SI")
(DI "V2DI") (V2DI "V2DI")
(V4HF "V8HF") (V8HF "V8HF")
+ (V4BF "V8BF") (V8BF "V8BF")
(V2SF "V4SF") (V4SF "V4SF")
(V2DF "V2DF") (SI "V4SI")
(HI "V8HI") (QI "V16QI")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e27a92c..e3c2adc 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -22007,6 +22007,8 @@ Enable the fp8 (8-bit floating point) to half-precision 2-way dot product
extension in streaming mode.
@item faminmax
Enable the Floating Point Absolute Maximum/Minimum extension.
+@item lut
+Enable the Lookup Table extension.
@item sve-b16b16
Enable the SVE non-widening brain floating-point (@code{bf16}) extension.
This only has an effect when @code{sve2} or @code{sme2} are also enabled.
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c b/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c
new file mode 100644
index 0000000..24e5d46
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c
@@ -0,0 +1,221 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+lut" } */
+
+#include "arm_neon.h"
+
+void
+test_var(uint8x16_t a, uint8x8_t b, uint8x16_t c, int x)
+{
+ vluti2q_lane_u8(a, b, x); /* { dg-error {argument 3 of 'vluti2q_lane_u8' must be an integer constant expression} } */
+ vluti2q_laneq_u8(a, c, x); /* { dg-error {argument 3 of 'vluti2q_laneq_u8' must be an integer constant expression} } */
+ vluti4q_lane_u8(a, b, x); /* { dg-error {argument 3 of 'vluti4q_lane_u8' must be an integer constant expression} } */
+ vluti4q_laneq_u8(a, c, x); /* { dg-error {argument 3 of 'vluti4q_laneq_u8' must be an integer constant expression} } */
+}
+
+void
+test_vluti2_laneu8(uint8x8_t a, uint8x8_t b, uint8x16_t c, uint8x16_t d)
+{
+ vluti2_lane_u8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti2_lane_u8(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti2_laneq_u8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2_laneq_u8(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2q_lane_u8(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti2q_lane_u8(c, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti2q_laneq_u8(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2q_laneq_u8(c, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti2_lanes8(int8x8_t a, uint8x8_t b, int8x16_t c, uint8x16_t d)
+{
+ vluti2_lane_s8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti2_lane_s8(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti2_laneq_s8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2_laneq_s8(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2q_lane_s8(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti2q_lane_s8(c, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti2q_laneq_s8(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2q_laneq_s8(c, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti2_lanep8(poly8x8_t a, uint8x8_t b, poly8x16_t c, uint8x16_t d)
+{
+ vluti2_lane_p8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti2_lane_p8(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti2_laneq_p8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2_laneq_p8(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2q_lane_p8(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti2q_lane_p8(c, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti2q_laneq_p8(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2q_laneq_p8(c, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti2_laneu16(uint16x4_t a, uint8x8_t b, uint16x8_t c, uint8x16_t d)
+{
+ vluti2_lane_u16(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2_lane_u16(a, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2_laneq_u16(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+ vluti2_laneq_u16(a, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+
+ vluti2q_lane_u16(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2q_lane_u16(c, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2q_laneq_u16(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+ vluti2q_laneq_u16(c, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+}
+
+void
+test_vluti2_lanes16(int16x4_t a, uint8x8_t b, int16x8_t c, uint8x16_t d)
+{
+ vluti2_lane_s16(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2_lane_s16(a, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2_laneq_s16(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+ vluti2_laneq_s16(a, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+
+ vluti2q_lane_s16(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2q_lane_s16(c, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2q_laneq_s16(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+ vluti2q_laneq_s16(c, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+}
+
+void
+test_vluti2_lanep16(poly16x4_t a, uint8x8_t b, poly16x8_t c, uint8x16_t d)
+{
+ vluti2_lane_p16(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2_lane_p16(a, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2_laneq_p16(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+ vluti2_laneq_p16(a, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+
+ vluti2q_lane_p16(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2q_lane_p16(c, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2q_laneq_p16(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+ vluti2q_laneq_p16(c, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+}
+
+void
+test_vluti2_lanef16(float16x4_t a, uint8x8_t b, float16x8_t c, uint8x16_t d)
+{
+ vluti2_lane_f16(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2_lane_f16(a, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2_laneq_f16(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+ vluti2_laneq_f16(a, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+
+ vluti2q_lane_f16(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2q_lane_f16(c, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2q_laneq_f16(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+ vluti2q_laneq_f16(c, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+}
+
+void
+test_vluti2_lanebf16(bfloat16x4_t a, uint8x8_t b, bfloat16x8_t c, uint8x16_t d)
+{
+ vluti2_lane_bf16(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2_lane_bf16(a, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2_laneq_bf16(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+ vluti2_laneq_bf16(a, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+
+ vluti2q_lane_bf16(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti2q_lane_bf16(c, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+
+ vluti2q_laneq_bf16(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+ vluti2q_laneq_bf16(c, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, which expects a value in the range \[0, 7\]} } */
+}
+
+void
+test_vluti4q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t d)
+{
+ vluti4q_lane_u8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects the value 0} } */
+ vluti4q_lane_u8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects the value 0} } */
+
+ vluti4q_laneq_u8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti4q_laneq_u8(a, d, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+}
+
+void
+test_vluti4q_lanes8(int8x16_t a, uint8x8_t b, uint8x16_t d)
+{
+ vluti4q_lane_s8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects the value 0} } */
+ vluti4q_lane_s8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects the value 0} } */
+
+ vluti4q_laneq_s8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti4q_laneq_s8(a, d, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+}
+
+void
+test_vluti4q_lanep8(poly8x16_t a, uint8x8_t b, uint8x16_t d)
+{
+ vluti4q_lane_p8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects the value 0} } */
+ vluti4q_lane_p8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects the value 0} } */
+
+ vluti4q_laneq_p8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti4q_laneq_p8(a, d, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+}
+
+void
+test_vluti4q_laneu16_x2(uint16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+ vluti4q_lane_u16_x2(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti4q_lane_u16_x2(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti4q_laneq_u16_x2(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti4q_laneq_u16_x2(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti4q_lanes16_x2(int16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+ vluti4q_lane_s16_x2(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti4q_lane_s16_x2(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti4q_laneq_s16_x2(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti4q_laneq_s16_x2(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti4q_lanep16_x2(poly16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+ vluti4q_lane_p16_x2(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti4q_lane_p16_x2(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti4q_laneq_p16_x2(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti4q_laneq_p16_x2(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti4q_lanef16_x2(float16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+ vluti4q_lane_f16_x2(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti4q_lane_f16_x2(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti4q_laneq_f16_x2(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti4q_laneq_f16_x2(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti4q_lanebf16_x2(bfloat16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+ vluti4q_lane_bf16_x2(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+ vluti4q_lane_bf16_x2(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, which expects a value in the range \[0, 1\]} } */
+
+ vluti4q_laneq_bf16_x2(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+ vluti4q_laneq_bf16_x2(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, which expects a value in the range \[0, 3\]} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c
new file mode 100644
index 0000000..d180d8f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a" } */
+
+#include "arm_neon.h"
+
+void
+test (uint8x8_t a, uint8x8_t b)
+{
+ vluti2_lane_u8 (a, b, 0); /* { dg-error {ACLE function 'vluti2_lane_u8' requires ISA extension 'lut'} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut.c b/gcc/testsuite/gcc.target/aarch64/simd/lut.c
new file mode 100644
index 0000000..fc89b21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/lut.c
@@ -0,0 +1,849 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+lut" } */
+/* { dg-final { check-function-bodies "**" ""} } */
+
+#include "arm_neon.h"
+
+/*
+** test_vluti2_laneu8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti2_laneu8(uint8x8_t a, uint8x8_t b, uint8x16_t results[])
+{
+ results[0] = vluti2_lane_u8(a, b, 0);
+ results[1] = vluti2_lane_u8(a, b, 1);
+}
+
+/*
+** test_vluti2_lanequ8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2_lanequ8(uint8x8_t a, uint8x16_t b, uint8x16_t results[])
+{
+ results[0] = vluti2_laneq_u8(a, b, 0);
+ results[1] = vluti2_laneq_u8(a, b, 1);
+ results[2] = vluti2_laneq_u8(a, b, 2);
+ results[3] = vluti2_laneq_u8(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneu8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti2q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t results[])
+{
+ results[0] = vluti2q_lane_u8(a, b, 0);
+ results[1] = vluti2q_lane_u8(a, b, 1);
+}
+
+/*
+** test_vluti2q_lanequ8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2q_lanequ8(uint8x16_t a, uint8x16_t b, uint8x16_t results[])
+{
+ results[0] = vluti2q_laneq_u8(a, b, 0);
+ results[1] = vluti2q_laneq_u8(a, b, 1);
+ results[2] = vluti2q_laneq_u8(a, b, 2);
+ results[3] = vluti2q_laneq_u8(a, b, 3);
+}
+
+/*
+** test_vluti2_lanes8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti2_lanes8(int8x8_t a, uint8x8_t b, int8x16_t results[])
+{
+ results[0] = vluti2_lane_s8(a, b, 0);
+ results[1] = vluti2_lane_s8(a, b, 1);
+}
+
+/*
+** test_vluti2_laneqs8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2_laneqs8(int8x8_t a, uint8x16_t b, int8x16_t results[])
+{
+ results[0] = vluti2_laneq_s8(a, b, 0);
+ results[1] = vluti2_laneq_s8(a, b, 1);
+ results[2] = vluti2_laneq_s8(a, b, 2);
+ results[3] = vluti2_laneq_s8(a, b, 3);
+}
+
+/*
+** test_vluti2q_lanes8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti2q_lanes8(int8x16_t a, uint8x8_t b, int8x16_t results[])
+{
+ results[0] = vluti2q_lane_s8(a, b, 0);
+ results[1] = vluti2q_lane_s8(a, b, 1);
+}
+
+/*
+** test_vluti2q_laneqs8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2q_laneqs8(int8x16_t a, uint8x16_t b, int8x16_t results[])
+{
+ results[0] = vluti2q_laneq_s8(a, b, 0);
+ results[1] = vluti2q_laneq_s8(a, b, 1);
+ results[2] = vluti2q_laneq_s8(a, b, 2);
+ results[3] = vluti2q_laneq_s8(a, b, 3);
+}
+
+/*
+** test_vluti2_lanep8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti2_lanep8(poly8x8_t a, uint8x8_t b, poly8x16_t results[])
+{
+ results[0] = vluti2_lane_p8(a, b, 0);
+ results[1] = vluti2_lane_p8(a, b, 1);
+}
+
+/*
+** test_vluti2_laneqp8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2_laneqp8(poly8x8_t a, uint8x16_t b, poly8x16_t results[])
+{
+ results[0] = vluti2_laneq_p8(a, b, 0);
+ results[1] = vluti2_laneq_p8(a, b, 1);
+ results[2] = vluti2_laneq_p8(a, b, 2);
+ results[3] = vluti2_laneq_p8(a, b, 3);
+}
+
+/*
+** test_vluti2q_lanep8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti2q_lanep8(poly8x16_t a, uint8x8_t b, poly8x16_t results[])
+{
+ results[0] = vluti2q_lane_p8(a, b, 0);
+ results[1] = vluti2q_lane_p8(a, b, 1);
+}
+
+/*
+** test_vluti2q_laneqp8:
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2q_laneqp8(poly8x16_t a, uint8x16_t b, poly8x16_t results[])
+{
+ results[0] = vluti2q_laneq_p8(a, b, 0);
+ results[1] = vluti2q_laneq_p8(a, b, 1);
+ results[2] = vluti2q_laneq_p8(a, b, 2);
+ results[3] = vluti2q_laneq_p8(a, b, 3);
+}
+
+/*
+** test_vluti2_laneu16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2_laneu16(uint16x4_t a, uint8x8_t b, uint16x8_t results[])
+{
+ results[0] = vluti2_lane_u16(a, b, 0);
+ results[1] = vluti2_lane_u16(a, b, 1);
+ results[2] = vluti2_lane_u16(a, b, 2);
+ results[3] = vluti2_lane_u16(a, b, 3);
+}
+
+/*
+** test_vluti2_lanequ16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+** ...
+** ret
+*/
+void
+test_vluti2_lanequ16(uint16x4_t a, uint8x16_t b, uint16x8_t results[])
+{
+ results[0] = vluti2_laneq_u16(a, b, 0);
+ results[1] = vluti2_laneq_u16(a, b, 1);
+ results[2] = vluti2_laneq_u16(a, b, 2);
+ results[3] = vluti2_laneq_u16(a, b, 3);
+ results[4] = vluti2_laneq_u16(a, b, 4);
+ results[5] = vluti2_laneq_u16(a, b, 5);
+ results[6] = vluti2_laneq_u16(a, b, 6);
+ results[7] = vluti2_laneq_u16(a, b, 7);
+}
+
+/*
+** test_vluti2q_laneu16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2q_laneu16(uint16x8_t a, uint8x8_t b, uint16x8_t results[])
+{
+ results[0] = vluti2q_lane_u16(a, b, 0);
+ results[1] = vluti2q_lane_u16(a, b, 1);
+ results[2] = vluti2q_lane_u16(a, b, 2);
+ results[3] = vluti2q_lane_u16(a, b, 3);
+}
+
+/*
+** test_vluti2q_lanequ16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+** ...
+** ret
+*/
+void
+test_vluti2q_lanequ16(uint16x8_t a, uint8x16_t b, uint16x8_t results[])
+{
+ results[0] = vluti2q_laneq_u16(a, b, 0);
+ results[1] = vluti2q_laneq_u16(a, b, 1);
+ results[2] = vluti2q_laneq_u16(a, b, 2);
+ results[3] = vluti2q_laneq_u16(a, b, 3);
+ results[4] = vluti2q_laneq_u16(a, b, 4);
+ results[5] = vluti2q_laneq_u16(a, b, 5);
+ results[6] = vluti2q_laneq_u16(a, b, 6);
+ results[7] = vluti2q_laneq_u16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanes16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2_lanes16(int16x4_t a, uint8x8_t b, int16x8_t results[])
+{
+ results[0] = vluti2_lane_s16(a, b, 0);
+ results[1] = vluti2_lane_s16(a, b, 1);
+ results[2] = vluti2_lane_s16(a, b, 2);
+ results[3] = vluti2_lane_s16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqs16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+** ...
+** ret
+*/
+void
+test_vluti2_laneqs16(int16x4_t a, uint8x16_t b, int16x8_t results[])
+{
+ results[0] = vluti2_laneq_s16(a, b, 0);
+ results[1] = vluti2_laneq_s16(a, b, 1);
+ results[2] = vluti2_laneq_s16(a, b, 2);
+ results[3] = vluti2_laneq_s16(a, b, 3);
+ results[4] = vluti2_laneq_s16(a, b, 4);
+ results[5] = vluti2_laneq_s16(a, b, 5);
+ results[6] = vluti2_laneq_s16(a, b, 6);
+ results[7] = vluti2_laneq_s16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanes16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2q_lanes16(int16x8_t a, uint8x8_t b, int16x8_t results[])
+{
+ results[0] = vluti2q_lane_s16(a, b, 0);
+ results[1] = vluti2q_lane_s16(a, b, 1);
+ results[2] = vluti2q_lane_s16(a, b, 2);
+ results[3] = vluti2q_lane_s16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqs16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+** ...
+** ret
+*/
+void
+test_vluti2q_laneqs16(int16x8_t a, uint8x16_t b, int16x8_t results[])
+{
+ results[0] = vluti2q_laneq_s16(a, b, 0);
+ results[1] = vluti2q_laneq_s16(a, b, 1);
+ results[2] = vluti2q_laneq_s16(a, b, 2);
+ results[3] = vluti2q_laneq_s16(a, b, 3);
+ results[4] = vluti2q_laneq_s16(a, b, 4);
+ results[5] = vluti2q_laneq_s16(a, b, 5);
+ results[6] = vluti2q_laneq_s16(a, b, 6);
+ results[7] = vluti2q_laneq_s16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanep16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2_lanep16(poly16x4_t a, uint8x8_t b, poly16x8_t results[])
+{
+ results[0] = vluti2_lane_p16(a, b, 0);
+ results[1] = vluti2_lane_p16(a, b, 1);
+ results[2] = vluti2_lane_p16(a, b, 2);
+ results[3] = vluti2_lane_p16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqp16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+** ...
+** ret
+*/
+void
+test_vluti2_laneqp16(poly16x4_t a, uint8x16_t b, poly16x8_t results[])
+{
+ results[0] = vluti2_laneq_p16(a, b, 0);
+ results[1] = vluti2_laneq_p16(a, b, 1);
+ results[2] = vluti2_laneq_p16(a, b, 2);
+ results[3] = vluti2_laneq_p16(a, b, 3);
+ results[4] = vluti2_laneq_p16(a, b, 4);
+ results[5] = vluti2_laneq_p16(a, b, 5);
+ results[6] = vluti2_laneq_p16(a, b, 6);
+ results[7] = vluti2_laneq_p16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanep16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2q_lanep16(poly16x8_t a, uint8x8_t b, poly16x8_t results[])
+{
+ results[0] = vluti2q_lane_p16(a, b, 0);
+ results[1] = vluti2q_lane_p16(a, b, 1);
+ results[2] = vluti2q_lane_p16(a, b, 2);
+ results[3] = vluti2q_lane_p16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqp16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+** ...
+** ret
+*/
+void
+test_vluti2q_laneqp16(poly16x8_t a, uint8x16_t b, poly16x8_t results[])
+{
+ results[0] = vluti2q_laneq_p16(a, b, 0);
+ results[1] = vluti2q_laneq_p16(a, b, 1);
+ results[2] = vluti2q_laneq_p16(a, b, 2);
+ results[3] = vluti2q_laneq_p16(a, b, 3);
+ results[4] = vluti2q_laneq_p16(a, b, 4);
+ results[5] = vluti2q_laneq_p16(a, b, 5);
+ results[6] = vluti2q_laneq_p16(a, b, 6);
+ results[7] = vluti2q_laneq_p16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanef16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2_lanef16(float16x4_t a, uint8x8_t b, float16x8_t results[])
+{
+ results[0] = vluti2_lane_f16(a, b, 0);
+ results[1] = vluti2_lane_f16(a, b, 1);
+ results[2] = vluti2_lane_f16(a, b, 2);
+ results[3] = vluti2_lane_f16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqf16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+** ...
+** ret
+*/
+void
+test_vluti2_laneqf16(float16x4_t a, uint8x16_t b, float16x8_t results[])
+{
+ results[0] = vluti2_laneq_f16(a, b, 0);
+ results[1] = vluti2_laneq_f16(a, b, 1);
+ results[2] = vluti2_laneq_f16(a, b, 2);
+ results[3] = vluti2_laneq_f16(a, b, 3);
+ results[4] = vluti2_laneq_f16(a, b, 4);
+ results[5] = vluti2_laneq_f16(a, b, 5);
+ results[6] = vluti2_laneq_f16(a, b, 6);
+ results[7] = vluti2_laneq_f16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanef16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2q_lanef16(float16x8_t a, uint8x8_t b, float16x8_t results[])
+{
+ results[0] = vluti2q_lane_f16(a, b, 0);
+ results[1] = vluti2q_lane_f16(a, b, 1);
+ results[2] = vluti2q_lane_f16(a, b, 2);
+ results[3] = vluti2q_lane_f16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqf16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+** ...
+** ret
+*/
+void
+test_vluti2q_laneqf16(float16x8_t a, uint8x16_t b, float16x8_t results[])
+{
+ results[0] = vluti2q_laneq_f16(a, b, 0);
+ results[1] = vluti2q_laneq_f16(a, b, 1);
+ results[2] = vluti2q_laneq_f16(a, b, 2);
+ results[3] = vluti2q_laneq_f16(a, b, 3);
+ results[4] = vluti2q_laneq_f16(a, b, 4);
+ results[5] = vluti2q_laneq_f16(a, b, 5);
+ results[6] = vluti2q_laneq_f16(a, b, 6);
+ results[7] = vluti2q_laneq_f16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanebf16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2_lanebf16(bfloat16x4_t a, uint8x8_t b, bfloat16x8_t results[])
+{
+ results[0] = vluti2_lane_bf16(a, b, 0);
+ results[1] = vluti2_lane_bf16(a, b, 1);
+ results[2] = vluti2_lane_bf16(a, b, 2);
+ results[3] = vluti2_lane_bf16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqbf16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+** ...
+** ret
+*/
+void
+test_vluti2_laneqbf16(bfloat16x4_t a, uint8x16_t b, bfloat16x8_t results[])
+{
+ results[0] = vluti2_laneq_bf16(a, b, 0);
+ results[1] = vluti2_laneq_bf16(a, b, 1);
+ results[2] = vluti2_laneq_bf16(a, b, 2);
+ results[3] = vluti2_laneq_bf16(a, b, 3);
+ results[4] = vluti2_laneq_bf16(a, b, 4);
+ results[5] = vluti2_laneq_bf16(a, b, 5);
+ results[6] = vluti2_laneq_bf16(a, b, 6);
+ results[7] = vluti2_laneq_bf16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanebf16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti2q_lanebf16(bfloat16x8_t a, uint8x8_t b, bfloat16x8_t results[])
+{
+ results[0] = vluti2q_lane_bf16(a, b, 0);
+ results[1] = vluti2q_lane_bf16(a, b, 1);
+ results[2] = vluti2q_lane_bf16(a, b, 2);
+ results[3] = vluti2q_lane_bf16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqbf16:
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+** ...
+** ret
+*/
+void
+test_vluti2q_laneqbf16(bfloat16x8_t a, uint8x16_t b, bfloat16x8_t results[])
+{
+ results[0] = vluti2q_laneq_bf16(a, b, 0);
+ results[1] = vluti2q_laneq_bf16(a, b, 1);
+ results[2] = vluti2q_laneq_bf16(a, b, 2);
+ results[3] = vluti2q_laneq_bf16(a, b, 3);
+ results[4] = vluti2q_laneq_bf16(a, b, 4);
+ results[5] = vluti2q_laneq_bf16(a, b, 5);
+ results[6] = vluti2q_laneq_bf16(a, b, 6);
+ results[7] = vluti2q_laneq_bf16(a, b, 7);
+}
+
+/*
+** test_vluti4q_laneu8:
+** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** ...
+** ret
+*/
+void
+test_vluti4q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t results[])
+{
+ results[0] = vluti4q_lane_u8(a, b, 0);
+}
+
+/*
+** test_vluti4q_lanequ8:
+** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti4q_lanequ8(uint8x16_t a, uint8x16_t b, uint8x16_t results[])
+{
+ results[0] = vluti4q_laneq_u8(a, b, 0);
+ results[1] = vluti4q_laneq_u8(a, b, 1);
+}
+
+/*
+** test_vluti4q_lanep8:
+** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** ...
+** ret
+*/
+void
+test_vluti4q_lanep8(poly8x16_t a, uint8x8_t b, poly8x16_t results[])
+{
+ results[0] = vluti4q_lane_p8(a, b, 0);
+}
+
+/*
+** test_vluti4q_laneqp8:
+** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti4q_laneqp8(poly8x16_t a, uint8x16_t b, poly8x16_t results[])
+{
+ results[0] = vluti4q_laneq_p8(a, b, 0);
+ results[1] = vluti4q_laneq_p8(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneu16_x2:
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti4q_laneu16_x2(uint16x8x2_t a, uint8x8_t b, uint16x8_t results[])
+{
+ results[0] = vluti4q_lane_u16_x2(a, b, 0);
+ results[1] = vluti4q_lane_u16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_lanequ16_x2:
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti4q_lanequ16_x2(uint16x8x2_t a, uint8x16_t b, uint16x8_t results[])
+{
+ results[0] = vluti4q_laneq_u16_x2(a, b, 0);
+ results[1] = vluti4q_laneq_u16_x2(a, b, 1);
+ results[2] = vluti4q_laneq_u16_x2(a, b, 2);
+ results[3] = vluti4q_laneq_u16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanes16_x2:
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti4q_lanes16_x2(int16x8x2_t a, uint8x8_t b, int16x8_t results[])
+{
+ results[0] = vluti4q_lane_s16_x2(a, b, 0);
+ results[1] = vluti4q_lane_s16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqs16_x2:
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti4q_laneqs16_x2(int16x8x2_t a, uint8x16_t b, int16x8_t results[])
+{
+ results[0] = vluti4q_laneq_s16_x2(a, b, 0);
+ results[1] = vluti4q_laneq_s16_x2(a, b, 1);
+ results[2] = vluti4q_laneq_s16_x2(a, b, 2);
+ results[3] = vluti4q_laneq_s16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanep16_x2:
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti4q_lanep16_x2(poly16x8x2_t a, uint8x8_t b, poly16x8_t results[])
+{
+ results[0] = vluti4q_lane_p16_x2(a, b, 0);
+ results[1] = vluti4q_lane_p16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqp16_x2:
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti4q_laneqp16_x2(poly16x8x2_t a, uint8x16_t b, poly16x8_t results[])
+{
+ results[0] = vluti4q_laneq_p16_x2(a, b, 0);
+ results[1] = vluti4q_laneq_p16_x2(a, b, 1);
+ results[2] = vluti4q_laneq_p16_x2(a, b, 2);
+ results[3] = vluti4q_laneq_p16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanef16_x2:
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti4q_lanef16_x2(float16x8x2_t a, uint8x8_t b, float16x8_t results[])
+{
+ results[0] = vluti4q_lane_f16_x2(a, b, 0);
+ results[1] = vluti4q_lane_f16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqf16_x2:
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti4q_laneqf16_x2(float16x8x2_t a, uint8x16_t b, float16x8_t results[])
+{
+ results[0] = vluti4q_laneq_f16_x2(a, b, 0);
+ results[1] = vluti4q_laneq_f16_x2(a, b, 1);
+ results[2] = vluti4q_laneq_f16_x2(a, b, 2);
+ results[3] = vluti4q_laneq_f16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanebf16_x2:
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+** ...
+** ret
+*/
+void
+test_vluti4q_lanebf16_x2(bfloat16x8x2_t a, uint8x8_t b, bfloat16x8_t results[])
+{
+ results[0] = vluti4q_lane_bf16_x2(a, b, 0);
+ results[1] = vluti4q_lane_bf16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqbf16_x2:
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+** ...
+** ret
+*/
+void
+test_vluti4q_laneqbf16_x2(bfloat16x8x2_t a, uint8x16_t b, bfloat16x8_t results[])
+{
+ results[0] = vluti4q_laneq_bf16_x2(a, b, 0);
+ results[1] = vluti4q_laneq_bf16_x2(a, b, 1);
+ results[2] = vluti4q_laneq_bf16_x2(a, b, 2);
+ results[3] = vluti4q_laneq_bf16_x2(a, b, 3);
+}