48 files changed, 4215 insertions, 164 deletions
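This patch wires the mfloat8_t (mf8) Advanced SIMD intrinsics into the pragma-based builtin framework: new load, store and lane signatures, immediate-lane checking, RTL expansion and gimple folding, plus the aarch64-simd-pragma-builtins.def entries and testsuite support. As a rough illustration of what the new entries enable, the sketch below uses only intrinsics that appear in the .def changes and assumes a compiler carrying this patch together with the usual ACLE type names (mfloat8_t, mfloat8x16_t); the function name is just for illustration:

  #include <arm_neon.h>

  /* Exercise a few of the new mfloat8 intrinsics (vld1q_mf8, vdupq_laneq_mf8,
     vextq_mf8, vbslq_mf8, vst1q_mf8).  Lane and ext indices must be
     compile-time constants, as enforced by the new immediate checks.  */
  void
  mf8_example (const mfloat8_t *in, mfloat8_t *out, uint8x16_t mask)
  {
    mfloat8x16_t a = vld1q_mf8 (in);            /* UNSPEC_LD1 */
    mfloat8x16_t b = vdupq_laneq_mf8 (a, 3);    /* UNSPEC_DUP_LANE */
    mfloat8x16_t c = vextq_mf8 (a, b, 5);       /* UNSPEC_EXT */
    mfloat8x16_t d = vbslq_mf8 (mask, a, c);    /* UNSPEC_BSL */
    vst1q_mf8 (out, d);                         /* UNSPEC_ST1 */
  }
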
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 9d1d026..6b3e220 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -702,6 +702,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { VREINTERPRETQ_BUILTINS #define AARCH64_SIMD_VGET_LOW_BUILTINS \ + VGET_LOW_BUILTIN(mf8) \ VGET_LOW_BUILTIN(f16) \ VGET_LOW_BUILTIN(f32) \ VGET_LOW_BUILTIN(f64) \ @@ -719,6 +720,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { VGET_LOW_BUILTIN(bf16) #define AARCH64_SIMD_VGET_HIGH_BUILTINS \ + VGET_HIGH_BUILTIN(mf8) \ VGET_HIGH_BUILTIN(f16) \ VGET_HIGH_BUILTIN(f32) \ VGET_HIGH_BUILTIN(f64) \ @@ -1096,6 +1098,8 @@ aarch64_int_or_fp_type (machine_mode mode, switch (mode) { case E_QImode: + if (qualifiers & qualifier_modal_float) + return aarch64_mfp8_type_node; return QUAL_TYPE (QI); case E_HImode: return QUAL_TYPE (HI); @@ -1333,6 +1337,16 @@ aarch64_init_simd_builtin_scalar_types (void) "__builtin_aarch64_simd_udi"); } +/* If MODE is a single Advanced SIMD vector, return the number of lanes in the + vector. If MODE is an Advanced SIMD structure/tuple mode, return the number + of lanes in a single vector. */ +static unsigned int +aarch64_num_lanes (machine_mode mode) +{ + unsigned int nregs = targetm.hard_regno_nregs (V0_REGNUM, mode); + return exact_div (GET_MODE_NUNITS (mode), nregs).to_constant (); +} + /* Return a set of FLAG_* flags derived from FLAGS that describe what a function with result MODE could do, taking the command-line flags into account. */ @@ -1620,9 +1634,15 @@ enum class aarch64_builtin_signatures { binary, binary_lane, + binary_two_lanes, + load, + load_lane, + store, + store_lane, ternary, ternary_lane, unary, + unary_lane, }; namespace { @@ -1631,22 +1651,27 @@ namespace { function argument type or return type. 
*/ struct simd_type { tree type () const { return aarch64_simd_builtin_type (mode, qualifiers); } + unsigned nunits () const { return GET_MODE_NUNITS (mode).to_constant (); } machine_mode mode; aarch64_type_qualifiers qualifiers; }; namespace simd_types { -#define VARIANTS(BASE, D, Q, MODE, QUALIFIERS) \ - constexpr simd_type BASE { V##D##MODE, QUALIFIERS }; \ - constexpr simd_type BASE##x2 { V2x##D##MODE, QUALIFIERS }; \ - constexpr simd_type BASE##x3 { V3x##D##MODE, QUALIFIERS }; \ - constexpr simd_type BASE##x4 { V4x##D##MODE, QUALIFIERS }; \ - constexpr simd_type BASE##q { V##Q##MODE, QUALIFIERS }; \ - constexpr simd_type BASE##qx2 { V2x##Q##MODE, QUALIFIERS }; \ - constexpr simd_type BASE##qx3 { V3x##Q##MODE, QUALIFIERS }; \ - constexpr simd_type BASE##qx4 { V4x##Q##MODE, QUALIFIERS }; \ - constexpr simd_type BASE##_scalar { MODE, QUALIFIERS }; +#define VARIANTS(BASE, D, Q, MODE, QUALIFIERS) \ + constexpr simd_type BASE { V##D##MODE, QUALIFIERS }; \ + constexpr simd_type BASE##x2 { V2x##D##MODE, QUALIFIERS }; \ + constexpr simd_type BASE##x3 { V3x##D##MODE, QUALIFIERS }; \ + constexpr simd_type BASE##x4 { V4x##D##MODE, QUALIFIERS }; \ + constexpr simd_type BASE##q { V##Q##MODE, QUALIFIERS }; \ + constexpr simd_type BASE##qx2 { V2x##Q##MODE, QUALIFIERS }; \ + constexpr simd_type BASE##qx3 { V3x##Q##MODE, QUALIFIERS }; \ + constexpr simd_type BASE##qx4 { V4x##Q##MODE, QUALIFIERS }; \ + constexpr simd_type BASE##_scalar { MODE, QUALIFIERS }; \ + constexpr simd_type BASE##_scalar_ptr \ + { MODE, aarch64_type_qualifiers (QUALIFIERS | qualifier_pointer) }; \ + constexpr simd_type BASE##_scalar_const_ptr \ + { MODE, aarch64_type_qualifiers (QUALIFIERS | qualifier_const_pointer) }; VARIANTS (mf8, 8, 16, QImode, qualifier_modal_float) VARIANTS (p8, 8, 16, QImode, qualifier_poly) @@ -1707,27 +1732,50 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data) { case aarch64_builtin_signatures::binary: case aarch64_builtin_signatures::binary_lane: + case aarch64_builtin_signatures::load_lane: return_type = builtin_data.types[0].type (); for (int i = 1; i <= 2; ++i) arg_types.quick_push (builtin_data.types[i].type ()); break; - case aarch64_builtin_signatures::ternary: - case aarch64_builtin_signatures::ternary_lane: + case aarch64_builtin_signatures::binary_two_lanes: + /* binary_two_lanes has to be handled as a special case because indices + interleave vectors. 
*/ return_type = builtin_data.types[0].type (); - for (int i = 1; i <= 3; ++i) - arg_types.quick_push (builtin_data.types[i].type ()); + arg_types.quick_push (builtin_data.types[1].type ()); + arg_types.quick_push (integer_type_node); + arg_types.quick_push (builtin_data.types[2].type ()); + arg_types.quick_push (integer_type_node); break; + case aarch64_builtin_signatures::load: case aarch64_builtin_signatures::unary: + case aarch64_builtin_signatures::unary_lane: return_type = builtin_data.types[0].type (); arg_types.quick_push (builtin_data.types[1].type ()); break; + + case aarch64_builtin_signatures::store: + case aarch64_builtin_signatures::store_lane: + return_type = void_type_node; + for (int i = 0; i <= 1; ++i) + arg_types.quick_push (builtin_data.types[i].type ()); + break; + + case aarch64_builtin_signatures::ternary: + case aarch64_builtin_signatures::ternary_lane: + return_type = builtin_data.types[0].type (); + for (int i = 1; i <= 3; ++i) + arg_types.quick_push (builtin_data.types[i].type ()); + break; } switch (builtin_data.signature) { case aarch64_builtin_signatures::binary_lane: + case aarch64_builtin_signatures::load_lane: + case aarch64_builtin_signatures::store_lane: case aarch64_builtin_signatures::ternary_lane: + case aarch64_builtin_signatures::unary_lane: arg_types.quick_push (integer_type_node); break; @@ -2654,8 +2702,9 @@ require_immediate_lane_index (unsigned int lane_argno, unsigned vec_argno, { auto vec_mode = TYPE_MODE (TREE_TYPE (args[vec_argno])); auto elt_mode = TYPE_MODE (TREE_TYPE (args[elt_argno])); - auto nunits = exact_div (GET_MODE_SIZE (vec_mode), - GET_MODE_UNIT_SIZE (elt_mode)).to_constant (); + auto nunits = (aarch64_num_lanes (vec_mode) + * GET_MODE_UNIT_SIZE (vec_mode) + / GET_MODE_UNIT_SIZE (elt_mode)); return require_immediate_range (lane_argno, 0, nunits - 1); } @@ -2674,8 +2723,25 @@ require_immediate_lane_index (unsigned int lane_argno, unsigned int vec_argno) bool aarch64_pragma_builtins_checker::check () { + auto &types = builtin_data.types; + switch (builtin_data.unspec) { + case UNSPEC_DUP_LANE: + case UNSPEC_GET_LANE: + case UNSPEC_LD2_LANE: + case UNSPEC_LD3_LANE: + case UNSPEC_LD4_LANE: + case UNSPEC_SET_LANE: + case UNSPEC_ST1_LANE: + case UNSPEC_ST2_LANE: + case UNSPEC_ST3_LANE: + case UNSPEC_ST4_LANE: + return require_immediate_lane_index (nargs - 1, nargs - 2); + + case UNSPEC_EXT: + return require_immediate_range (2, 0, types[2].nunits () - 1); + case UNSPEC_FDOT_LANE_FP8: return require_immediate_lane_index (nargs - 2, nargs - 3, 0); @@ -2695,11 +2761,8 @@ aarch64_pragma_builtins_checker::check () case UNSPEC_LUTI2: case UNSPEC_LUTI4: { - auto vector_to_index_mode = builtin_data.types[nargs - 1].mode; - int vector_to_index_nunits - = GET_MODE_NUNITS (vector_to_index_mode).to_constant (); - int output_mode_nunits - = GET_MODE_NUNITS (builtin_data.types[0].mode).to_constant (); + auto vector_to_index_nunits = types[nargs - 1].nunits (); + int output_mode_nunits = types[0].nunits (); int high; if (builtin_data.unspec == UNSPEC_LUTI2) @@ -2710,6 +2773,11 @@ aarch64_pragma_builtins_checker::check () return require_immediate_range (nargs - 1, 0, high); } + case UNSPEC_VEC_COPY: + /* & rather than && so that we report errors against both indices. 
*/ + return (require_immediate_lane_index (1, 0) + & require_immediate_lane_index (3, 2)); + default: return true; } @@ -3622,6 +3690,52 @@ aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target) return ops[0].value; } +/* Convert ptr_mode value OP to a Pmode value (for ILP32). */ +static void +aarch64_convert_address (expand_operand *op) +{ + op->value = convert_memory_address (Pmode, op->value); + op->mode = Pmode; +} + +/* Dereference the pointer in OP, turning it into a memory reference to + NELTS instances of MEM_MODE. */ +static void +aarch64_dereference_pointer (expand_operand *op, machine_mode mem_mode, + unsigned int nelts = 1) +{ + if (nelts == 1) + { + op->value = gen_rtx_MEM (mem_mode, op->value); + op->mode = mem_mode; + } + else + { + op->value = gen_rtx_MEM (BLKmode, op->value); + op->mode = BLKmode; + set_mem_size (op->value, GET_MODE_SIZE (mem_mode) * nelts); + } +} + +/* OP contains an integer index into a vector or tuple of mode VEC_MODE. + Convert OP from an architectural lane number to a GCC lane number. */ +static void +aarch64_canonicalize_lane (expand_operand *op, machine_mode vec_mode) +{ + auto nunits = aarch64_num_lanes (vec_mode); + op->value = gen_int_mode (ENDIAN_LANE_N (nunits, UINTVAL (op->value)), + SImode); +} + +/* OP contains an integer index into a vector or tuple of mode VEC_MODE. + Convert OP from an architectural lane number to a vec_merge mask. */ +static void +aarch64_convert_to_lane_mask (expand_operand *op, machine_mode vec_mode) +{ + auto nunits = aarch64_num_lanes (vec_mode); + create_integer_operand (op, 1 << ENDIAN_LANE_N (nunits, INTVAL (op->value))); +} + /* If OP is a 128-bit vector, convert it to the equivalent 64-bit vector. Do nothing otherwise. */ static void @@ -3634,6 +3748,56 @@ aarch64_convert_to_v64 (expand_operand *op) } } +/* If OP is a 64-bit (half-register) vector or a structure of 64-bit vectors, + pack its contents into the smallest associated full-register mode, + padding with zeros if necessary. Return true if padding was used. */ +static bool +aarch64_pack_into_v128s (expand_operand *op) +{ + bool padded = false; + unsigned int nregs = targetm.hard_regno_nregs (V0_REGNUM, op->mode); + + /* Do nothing if the operand is already a full-register mode. */ + if (known_eq (nregs * UNITS_PER_VREG, GET_MODE_SIZE (op->mode))) + return padded; + + auto elt_mode = GET_MODE_INNER (op->mode); + auto v64_mode = aarch64_v64_mode (elt_mode).require (); + auto v128_mode = aarch64_v128_mode (elt_mode).require (); + + auto new_mode = v128_mode; + if (nregs > 2) + new_mode = aarch64_advsimd_vector_array_mode (v128_mode, CEIL (nregs, 2)) + .require (); + + /* Get enough V64_MODE inputs to fill NEW_MDOE, which is made up of a + whole number of V128_MODEs. */ + auto_vec<rtx, 4> inputs; + for (unsigned int i = 0; i < nregs; ++i) + { + rtx input = simplify_gen_subreg (v64_mode, op->value, op->mode, + i * GET_MODE_SIZE (v64_mode)); + inputs.quick_push (input); + } + if (nregs & 1) + { + inputs.quick_push (CONST0_RTX (v64_mode)); + padded = true; + } + + /* Create a NEW_MODE register and build it up from individual V128_MODEs. 
*/ + op->mode = new_mode; + op->value = gen_reg_rtx (new_mode); + for (unsigned int i = 0; i < inputs.length (); i += 2) + { + rtx result = gen_rtx_SUBREG (v128_mode, op->value, + i * GET_MODE_SIZE (v64_mode)); + emit_insn (gen_aarch64_combine (v64_mode, result, + inputs[i], inputs[i + 1])); + } + return padded; +} + /* UNSPEC is a high unspec, indicated by "2" in mnemonics and "_high" in intrinsic names. Return the equivalent low unspec. */ static int @@ -3652,6 +3816,88 @@ aarch64_get_low_unspec (int unspec) } } +/* OPS contains the operands for one of the permute pair functions vtrn, + vuzp or vzip. Expand the call, given that PERMUTE1 is the unspec for + the first permute and PERMUTE2 is the unspec for the second permute. */ +static rtx +aarch64_expand_permute_pair (vec<expand_operand> &ops, int permute1, + int permute2) +{ + rtx op0 = force_reg (ops[1].mode, ops[1].value); + rtx op1 = force_reg (ops[2].mode, ops[2].value); + rtx target = gen_reg_rtx (ops[0].mode); + rtx target0 = gen_rtx_SUBREG (ops[1].mode, target, 0); + rtx target1 = gen_rtx_SUBREG (ops[1].mode, target, + GET_MODE_SIZE (ops[1].mode)); + emit_insn (gen_aarch64 (permute1, ops[1].mode, target0, op0, op1)); + emit_insn (gen_aarch64 (permute2, ops[1].mode, target1, op0, op1)); + return target; +} + +/* Emit a TBL or TBX instruction with inputs INPUTS and a result of mode + MODE. Return the result of the instruction. + + UNSPEC is either UNSPEC_TBL or UNSPEC_TBX. The inputs must already be in + registers. */ +static rtx +aarch64_expand_tbl_tbx (vec<rtx> &inputs, int unspec, machine_mode mode) +{ + rtx result = gen_reg_rtx (mode); + rtvec vec = gen_rtvec_v (inputs.length (), inputs.address ()); + emit_insn (gen_rtx_SET (result, gen_rtx_UNSPEC (mode, vec, unspec))); + return result; +} + +/* Emit a TBL or TBX intrinsic with the operands given by OPS. Return the + result of the intrinsic. + + UNSPEC is either UNSPEC_TBL or UNSPEC_TBX. */ +static rtx +aarch64_expand_tbl_tbx (vec<expand_operand> &ops, int unspec) +{ + for (unsigned int i = 1; i < ops.length (); ++i) + ops[i].value = force_reg (ops[i].mode, ops[i].value); + + /* Handle the legacy forms for which the table is composed of 64-bit + rather than 128-bit vectors. */ + auto &table = ops[ops.length () - 2]; + auto table_nelts = GET_MODE_NUNITS (table.mode); + bool padded = aarch64_pack_into_v128s (&table); + + /* Packing to 128-bit vectors is enough for everything except the 64-bit + forms of vtbx1 and vtbx3, where we need to handle the zero padding. */ + if (unspec == UNSPEC_TBL || !padded) + { + auto_vec<rtx, 3> inputs; + for (unsigned int i = 1; i < ops.length (); ++i) + inputs.quick_push (ops[i].value); + return aarch64_expand_tbl_tbx (inputs, unspec, ops[0].mode); + } + + /* Generate a TBL, which will give the right results for indices that + are less than TABLE_NELTS. */ + auto_vec<rtx, 2> inputs; + for (unsigned int i = 2; i < ops.length (); ++i) + inputs.quick_push (ops[i].value); + rtx tbl_result = aarch64_expand_tbl_tbx (inputs, UNSPEC_TBL, ops[0].mode); + + /* Get a mask of the indices that are less than TABLE_NELTS. 
*/ + auto &indices = ops.last (); + rtx cmp_result = gen_reg_rtx (indices.mode); + rtx bound_rtx = gen_int_mode (table_nelts, GET_MODE_INNER (indices.mode)); + rtx bound_vec_rtx = gen_const_vec_duplicate (indices.mode, bound_rtx); + emit_insn (gen_aarch64_cm (GTU, indices.mode, cmp_result, + force_reg (indices.mode, bound_vec_rtx), + indices.value)); + + /* Select from the TBL result if the index is less than TABLE_NELTS + and from OPS[1] otherwise. */ + rtx result = gen_reg_rtx (ops[0].mode); + auto icode = get_vcond_mask_icode (ops[0].mode, indices.mode); + emit_insn (GEN_FCN (icode) (result, tbl_result, ops[1].value, cmp_result)); + return result; +} + /* Expand CALL_EXPR EXP, given that it is a call to the function described by BUILTIN_DATA, and return the function's return value. Put the result in TARGET if convenient. */ @@ -3660,15 +3906,19 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, const aarch64_pragma_builtins_data &builtin_data) { unsigned int nargs = call_expr_nargs (exp); + bool returns_void = VOID_TYPE_P (TREE_TYPE (exp)); auto_vec<expand_operand, 8> ops; - ops.safe_grow (nargs + 1); - create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (exp))); - for (unsigned int i = 1; i <= nargs; ++i) + if (!returns_void) + create_output_operand (ops.safe_push ({}), target, + TYPE_MODE (TREE_TYPE (exp))); + for (unsigned int i = 0; i < nargs; ++i) { - tree arg = CALL_EXPR_ARG (exp, i - 1); - create_input_operand (&ops[i], expand_normal (arg), + tree arg = CALL_EXPR_ARG (exp, i); + create_input_operand (ops.safe_push ({}), expand_normal (arg), TYPE_MODE (TREE_TYPE (arg))); + if (POINTER_TYPE_P (TREE_TYPE (arg))) + aarch64_convert_address (&ops.last ()); } if (builtin_data.flags & FLAG_USES_FPMR) @@ -3698,12 +3948,43 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, insn_code icode; switch (builtin_data.unspec) { + case UNSPEC_BSL: + icode = code_for_aarch64_simd_bsl (ops[0].mode); + break; + + case UNSPEC_COMBINE: + icode = code_for_aarch64_combine (ops[1].mode); + break; + + case UNSPEC_DUP: + if (builtin_data.signature == aarch64_builtin_signatures::load) + aarch64_dereference_pointer (&ops[1], GET_MODE_INNER (ops[0].mode)); + return expand_vector_broadcast (ops[0].mode, ops[1].value); + + case UNSPEC_DUP_LANE: + aarch64_canonicalize_lane (&ops[2], ops[1].mode); + if (ops[0].mode == ops[1].mode) + icode = code_for_aarch64_dup_lane (ops[0].mode); + else + icode = code_for_aarch64_dup_lane (ops[0].mode, ops[0].mode); + break; + + case UNSPEC_EXT: + icode = code_for_aarch64_ext (ops[0].mode); + break; + case UNSPEC_FAMAX: case UNSPEC_FAMIN: case UNSPEC_F1CVTL_FP8: case UNSPEC_F2CVTL_FP8: case UNSPEC_FDOT_FP8: case UNSPEC_FSCALE: + case UNSPEC_TRN1: + case UNSPEC_TRN2: + case UNSPEC_UZP1: + case UNSPEC_UZP2: + case UNSPEC_ZIP1: + case UNSPEC_ZIP2: icode = code_for_aarch64 (builtin_data.unspec, ops[0].mode); break; @@ -3737,6 +4018,7 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, } case UNSPEC_FDOT_LANE_FP8: + /* This pattern does not canonicalize the lane number. 
*/ icode = code_for_aarch64_lane (builtin_data.unspec, ops[0].mode, ops[3].mode); break; @@ -3749,8 +4031,7 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, case UNSPEC_FMLALLTT_FP8: if (builtin_data.signature == aarch64_builtin_signatures::ternary_lane) { - ops[4].value = aarch64_endian_lane_rtx (ops[3].mode, - INTVAL (ops[4].value)); + aarch64_canonicalize_lane (&ops[4], ops[3].mode); icode = code_for_aarch64_lane (builtin_data.unspec, ops[0].mode, ops[3].mode); } @@ -3760,6 +4041,55 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, gcc_unreachable (); break; + case UNSPEC_GET_LANE: + aarch64_canonicalize_lane (&ops[2], ops[1].mode); + icode = code_for_aarch64_get_lane (ops[1].mode); + break; + + case UNSPEC_LD1: + icode = code_for_aarch64_ld1 (ops[0].mode); + break; + + case UNSPEC_LD1x2: + icode = code_for_aarch64_ld1x2 (ops[0].mode); + break; + + case UNSPEC_LD1x3: + icode = code_for_aarch64_ld1x3 (ops[0].mode); + break; + + case UNSPEC_LD1x4: + icode = code_for_aarch64_ld1x4 (ops[0].mode); + break; + + case UNSPEC_LD2: + case UNSPEC_LD3: + case UNSPEC_LD4: + icode = code_for_aarch64_ld (ops[0].mode, ops[0].mode); + break; + + case UNSPEC_LD2_DUP: + aarch64_dereference_pointer (&ops[1], GET_MODE_INNER (ops[0].mode), 2); + icode = code_for_aarch64_simd_ld2r (ops[0].mode); + break; + + case UNSPEC_LD3_DUP: + aarch64_dereference_pointer (&ops[1], GET_MODE_INNER (ops[0].mode), 3); + icode = code_for_aarch64_simd_ld3r (ops[0].mode); + break; + + case UNSPEC_LD4_DUP: + aarch64_dereference_pointer (&ops[1], GET_MODE_INNER (ops[0].mode), 4); + icode = code_for_aarch64_simd_ld4r (ops[0].mode); + break; + + case UNSPEC_LD2_LANE: + case UNSPEC_LD3_LANE: + case UNSPEC_LD4_LANE: + aarch64_canonicalize_lane (&ops[3], ops[2].mode); + icode = code_for_aarch64_ld_lane (ops[0].mode, ops[0].mode); + break; + case UNSPEC_LUTI2: case UNSPEC_LUTI4: create_integer_operand (ops.safe_push ({}), @@ -3767,6 +4097,86 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, icode = code_for_aarch64_lut (ops[1].mode, ops[2].mode); break; + case UNSPEC_REV16: + case UNSPEC_REV32: + case UNSPEC_REV64: + icode = code_for_aarch64_rev (builtin_data.unspec, ops[0].mode); + break; + + case UNSPEC_SET_LANE: + if (builtin_data.signature == aarch64_builtin_signatures::load_lane) + aarch64_dereference_pointer (&ops[1], GET_MODE_INNER (ops[0].mode)); + /* The vec_set operand order is: dest, scalar, mask, vector. */ + std::swap (ops[2], ops[3]); + aarch64_convert_to_lane_mask (&ops[2], ops[3].mode); + icode = code_for_aarch64_simd_vec_set (ops[0].mode); + break; + + case UNSPEC_ST1: + icode = code_for_aarch64_st1 (ops[1].mode); + break; + + case UNSPEC_ST1_LANE: + aarch64_dereference_pointer (&ops[0], GET_MODE_INNER (ops[1].mode)); + /* Reinterpret ops[0] as an output. 
*/ + create_fixed_operand (&ops[0], ops[0].value); + aarch64_canonicalize_lane (&ops[2], ops[1].mode); + icode = code_for_aarch64_get_lane (ops[1].mode); + break; + + case UNSPEC_ST1x2: + icode = code_for_aarch64_st1x2 (ops[1].mode); + break; + + case UNSPEC_ST1x3: + icode = code_for_aarch64_st1x3 (ops[1].mode); + break; + + case UNSPEC_ST1x4: + icode = code_for_aarch64_st1x4 (ops[1].mode); + break; + + case UNSPEC_ST2: + case UNSPEC_ST3: + case UNSPEC_ST4: + icode = code_for_aarch64_st (ops[1].mode, ops[1].mode); + break; + + case UNSPEC_ST2_LANE: + case UNSPEC_ST3_LANE: + case UNSPEC_ST4_LANE: + aarch64_canonicalize_lane (&ops[2], ops[1].mode); + icode = code_for_aarch64_st_lane (ops[1].mode, ops[1].mode); + break; + + case UNSPEC_TBL: + case UNSPEC_TBX: + return aarch64_expand_tbl_tbx (ops, builtin_data.unspec); + + case UNSPEC_TRN: + return aarch64_expand_permute_pair (ops, UNSPEC_TRN1, UNSPEC_TRN2); + + case UNSPEC_UZP: + return aarch64_expand_permute_pair (ops, UNSPEC_UZP1, UNSPEC_UZP2); + + case UNSPEC_VCREATE: + return force_lowpart_subreg (ops[0].mode, ops[1].value, ops[1].mode); + + case UNSPEC_VEC_COPY: + { + aarch64_convert_to_lane_mask (&ops[2], ops[1].mode); + aarch64_canonicalize_lane (&ops[4], ops[3].mode); + if (ops[1].mode == ops[3].mode) + icode = code_for_aarch64_simd_vec_copy_lane (ops[1].mode); + else + icode = code_for_aarch64_simd_vec_copy_lane (ops[1].mode, + ops[1].mode); + break; + } + + case UNSPEC_ZIP: + return aarch64_expand_permute_pair (ops, UNSPEC_ZIP1, UNSPEC_ZIP2); + default: gcc_unreachable (); } @@ -4214,12 +4624,346 @@ aarch64_record_vector_load_arg (tree addr) cfun->machine->vector_load_decls->add (decl); } +/* Force VAL into a valid gimple value, creating a new SSA_NAME if + necessary. Insert any new statements before GSI. */ +static tree +aarch64_force_gimple_val (gimple_stmt_iterator *gsi, tree val) +{ + if (is_gimple_val (val)) + return val; + + tree tmp = make_ssa_name (TREE_TYPE (val)); + gsi_insert_before_without_update (gsi, gimple_build_assign (tmp, val), + GSI_SAME_STMT); + return tmp; +} + +/* Copy vops from FROM to TO and return TO. */ +static gimple * +aarch64_copy_vops (gimple *to, gimple *from) +{ + gimple_set_vuse (to, gimple_vuse (from)); + gimple_set_vdef (to, gimple_vdef (from)); + return to; +} + +/* Fold STMT (at GSI) to VAL, with SEQ setting up the value of VAL. + Return the replacement statement. */ +static gimple * +aarch64_fold_to_val (gcall *stmt, gimple_stmt_iterator *gsi, + gimple *seq, tree val) +{ + auto *assign = gimple_build_assign (gimple_call_lhs (stmt), val); + gimple_seq_add_stmt_without_update (&seq, assign); + gsi_replace_with_seq_vops (gsi, seq); + return assign; +} + +/* Dereference pointer ADDR, giving a memory reference of type TYPE. */ +static tree +aarch64_dereference (tree addr, tree type) +{ + tree elt_type = (VECTOR_TYPE_P (type) ? TREE_TYPE (type) : type); + tree elt_ptr_type = build_pointer_type_for_mode (elt_type, VOIDmode, true); + tree zero = build_zero_cst (elt_ptr_type); + /* Use element type alignment. */ + tree access_type = build_aligned_type (type, TYPE_ALIGN (elt_type)); + return fold_build2 (MEM_REF, access_type, addr, zero); +} + +/* LANE is a lane index into VEC. Return the associated bit index + (counting from the first byte in memory order). 
*/ +static tree +aarch64_get_lane_bit_index (tree vec, tree lane) +{ + auto vec_mode = TYPE_MODE (TREE_TYPE (vec)); + auto nunits = aarch64_num_lanes (vec_mode); + auto idx = ENDIAN_LANE_N (nunits, tree_to_uhwi (lane)); + return bitsize_int (idx * GET_MODE_UNIT_BITSIZE (vec_mode)); +} + +/* LANE is a lane index into VEC. Return a BIT_FIELD_REF for the + selected element. */ +static tree +aarch64_get_lane (tree vec, tree lane) +{ + auto elt_type = TREE_TYPE (TREE_TYPE (vec)); + return fold_build3 (BIT_FIELD_REF, elt_type, vec, TYPE_SIZE (elt_type), + aarch64_get_lane_bit_index (vec, lane)); +} + +/* LANE is a lane index into VEC. Return a BIT_INSERT_EXPR that replaces + that index with ELT and stores the result in LHS. */ +static gimple * +aarch64_set_lane (tree lhs, tree elt, tree vec, tree lane) +{ + tree bit = aarch64_get_lane_bit_index (vec, lane); + return gimple_build_assign (lhs, BIT_INSERT_EXPR, vec, elt, bit); +} + +/* Fold a call to vcombine. */ +static gimple * +aarch64_fold_combine (gcall *stmt) +{ + tree first_part, second_part; + if (BYTES_BIG_ENDIAN) + { + second_part = gimple_call_arg (stmt, 0); + first_part = gimple_call_arg (stmt, 1); + } + else + { + first_part = gimple_call_arg (stmt, 0); + second_part = gimple_call_arg (stmt, 1); + } + tree ret_type = gimple_call_return_type (stmt); + tree ctor = build_constructor_va (ret_type, 2, NULL_TREE, first_part, + NULL_TREE, second_part); + return gimple_build_assign (gimple_call_lhs (stmt), ctor); +} + +/* Fold a call to vld1, given that it loads something of type TYPE. */ +static gimple * +aarch64_fold_load (gcall *stmt, tree type) +{ + /* Punt until after inlining, so that we stand more chance of + recording something meaningful in vector_load_decls. */ + if (!cfun->after_inlining) + return nullptr; + tree addr = gimple_call_arg (stmt, 0); + aarch64_record_vector_load_arg (addr); + if (!BYTES_BIG_ENDIAN) + { + tree mem = aarch64_dereference (addr, type); + auto *new_stmt = gimple_build_assign (gimple_get_lhs (stmt), mem); + return aarch64_copy_vops (new_stmt, stmt); + } + return nullptr; +} + +/* Fold a call to vst1, given that it loads something of type TYPE. */ +static gimple * +aarch64_fold_store (gcall *stmt, tree type) +{ + tree addr = gimple_call_arg (stmt, 0); + tree data = gimple_call_arg (stmt, 1); + if (!BYTES_BIG_ENDIAN) + { + tree mem = aarch64_dereference (addr, type); + auto *new_stmt = gimple_build_assign (mem, data); + return aarch64_copy_vops (new_stmt, stmt); + } + return nullptr; +} + +/* An aarch64_fold_permute callback for vext. SELECTOR is the value of + the final argument. */ +static unsigned int +aarch64_ext_index (unsigned int, unsigned int selector, unsigned int i) +{ + return selector + i; +} + +/* An aarch64_fold_permute callback for vrev. SELECTOR is the number + of elements in each reversal group. */ +static unsigned int +aarch64_rev_index (unsigned int, unsigned int selector, unsigned int i) +{ + return ROUND_DOWN (i, selector) + (selector - 1) - (i % selector); +} + +/* An aarch64_fold_permute callback for vtrn. SELECTOR is 0 for TRN1 + and 1 for TRN2. */ +static unsigned int +aarch64_trn_index (unsigned int nelts, unsigned int selector, unsigned int i) +{ + return (i % 2) * nelts + ROUND_DOWN (i, 2) + selector; +} + +/* An aarch64_fold_permute callback for vuzp. SELECTOR is 0 for UZP1 + and 1 for UZP2. */ +static unsigned int +aarch64_uzp_index (unsigned int, unsigned int selector, unsigned int i) +{ + return i * 2 + selector; +} + +/* An aarch64_fold_permute callback for vzip. 
SELECTOR is 0 for ZIP1 + and 1 for ZIP2. */ +static unsigned int +aarch64_zip_index (unsigned int nelts, unsigned int selector, unsigned int i) +{ + return (i % 2) * nelts + (i / 2) + selector * (nelts / 2); +} + +/* Fold STMT to a VEC_PERM_EXPR on the first NINPUTS arguments. + Make the VEC_PERM_EXPR emulate an NINPUTS-input TBL in which + architectural lane I of the result selects architectural lane: + + GET_INDEX (NELTS, SELECTOR, I) + + of the input table. NELTS is the number of elements in one vector. */ +static gimple * +aarch64_fold_permute (gcall *stmt, unsigned int ninputs, + unsigned int (*get_index) (unsigned int, unsigned int, + unsigned int), + unsigned int selector) +{ + tree op0 = gimple_call_arg (stmt, 0); + tree op1 = ninputs == 2 ? gimple_call_arg (stmt, 1) : op0; + auto nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0)).to_constant (); + vec_perm_builder sel (nelts, nelts, 1); + for (unsigned int i = 0; i < nelts; ++i) + { + unsigned int index = get_index (nelts, selector, + ENDIAN_LANE_N (nelts, i)); + unsigned int vec = index / nelts; + unsigned int elt = ENDIAN_LANE_N (nelts, index % nelts); + sel.quick_push (vec * nelts + elt); + } + + vec_perm_indices indices (sel, ninputs, nelts); + tree mask_type = build_vector_type (ssizetype, nelts); + tree mask = vec_perm_indices_to_tree (mask_type, indices); + return gimple_build_assign (gimple_call_lhs (stmt), VEC_PERM_EXPR, + op0, op1, mask); +} + +/* Try to fold STMT (at GSI), given that it is a call to the builtin + described by BUILTIN_DATA. Return the new statement on success, + otherwise return null. */ +static gimple * +aarch64_gimple_fold_pragma_builtin + (gcall *stmt, gimple_stmt_iterator *gsi, + const aarch64_pragma_builtins_data &builtin_data) +{ + auto &types = builtin_data.types; + + switch (builtin_data.unspec) + { + case UNSPEC_COMBINE: + return aarch64_fold_combine (stmt); + + case UNSPEC_DUP: + case UNSPEC_DUP_LANE: + { + tree arg = gimple_call_arg (stmt, 0); + tree type = types[0].type (); + if (builtin_data.signature == aarch64_builtin_signatures::load) + arg = aarch64_dereference (arg, TREE_TYPE (type)); + else if (builtin_data.unspec == UNSPEC_DUP_LANE) + arg = aarch64_get_lane (arg, gimple_call_arg (stmt, 1)); + arg = aarch64_force_gimple_val (gsi, arg); + + tree dup = build_vector_from_val (type, arg); + return aarch64_fold_to_val (stmt, gsi, nullptr, dup); + } + + case UNSPEC_EXT: + { + auto index = tree_to_uhwi (gimple_call_arg (stmt, 2)); + return aarch64_fold_permute (stmt, 2, aarch64_ext_index, index); + } + + case UNSPEC_GET_LANE: + { + tree val = aarch64_get_lane (gimple_call_arg (stmt, 0), + gimple_call_arg (stmt, 1)); + return gimple_build_assign (gimple_call_lhs (stmt), val); + } + + case UNSPEC_LD1: + return aarch64_fold_load (stmt, types[0].type ()); + + case UNSPEC_REV16: + { + auto selector = 16 / GET_MODE_UNIT_BITSIZE (types[0].mode); + return aarch64_fold_permute (stmt, 1, aarch64_rev_index, selector); + } + + case UNSPEC_REV32: + { + auto selector = 32 / GET_MODE_UNIT_BITSIZE (types[0].mode); + return aarch64_fold_permute (stmt, 1, aarch64_rev_index, selector); + } + + case UNSPEC_REV64: + { + auto selector = 64 / GET_MODE_UNIT_BITSIZE (types[0].mode); + return aarch64_fold_permute (stmt, 1, aarch64_rev_index, selector); + } + + case UNSPEC_SET_LANE: + { + tree elt = gimple_call_arg (stmt, 0); + if (builtin_data.signature == aarch64_builtin_signatures::load_lane) + { + elt = aarch64_dereference (elt, TREE_TYPE (types[0].type ())); + elt = aarch64_force_gimple_val (gsi, elt); + } + return 
aarch64_set_lane (gimple_call_lhs (stmt), elt, + gimple_call_arg (stmt, 1), + gimple_call_arg (stmt, 2)); + } + + case UNSPEC_ST1: + return aarch64_fold_store (stmt, types[1].type ()); + + case UNSPEC_ST1_LANE: + { + tree val = aarch64_get_lane (gimple_call_arg (stmt, 1), + gimple_call_arg (stmt, 2)); + tree mem = aarch64_dereference (gimple_call_arg (stmt, 0), + TREE_TYPE (types[0].type ())); + val = aarch64_force_gimple_val (gsi, val); + return aarch64_copy_vops (gimple_build_assign (mem, val), stmt); + } + + case UNSPEC_TRN1: + return aarch64_fold_permute (stmt, 2, aarch64_trn_index, 0); + + case UNSPEC_TRN2: + return aarch64_fold_permute (stmt, 2, aarch64_trn_index, 1); + + case UNSPEC_UZP1: + return aarch64_fold_permute (stmt, 2, aarch64_uzp_index, 0); + + case UNSPEC_UZP2: + return aarch64_fold_permute (stmt, 2, aarch64_uzp_index, 1); + + case UNSPEC_VCREATE: + return gimple_build_assign (gimple_call_lhs (stmt), + fold_build1 (VIEW_CONVERT_EXPR, + types[0].type (), + gimple_call_arg (stmt, 0))); + + case UNSPEC_VEC_COPY: + { + tree elt = aarch64_get_lane (gimple_call_arg (stmt, 2), + gimple_call_arg (stmt, 3)); + elt = aarch64_force_gimple_val (gsi, elt); + return aarch64_set_lane (gimple_call_lhs (stmt), elt, + gimple_call_arg (stmt, 0), + gimple_call_arg (stmt, 1)); + } + + case UNSPEC_ZIP1: + return aarch64_fold_permute (stmt, 2, aarch64_zip_index, 0); + + case UNSPEC_ZIP2: + return aarch64_fold_permute (stmt, 2, aarch64_zip_index, 1); + + default: + return nullptr; + } +} + /* Try to fold STMT, given that it's a call to the built-in function with subcode FCODE. Return the new statement on success and null on failure. */ gimple * aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, - gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED) + gimple_stmt_iterator *gsi) { gimple *new_stmt = NULL; unsigned nargs = gimple_call_num_args (stmt); @@ -4249,81 +4993,33 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, BUILTIN_VDC (BINOP, combine, 0, QUIET) BUILTIN_VD_I (BINOPU, combine, 0, DEFAULT) BUILTIN_VDC_P (BINOPP, combine, 0, DEFAULT) - { - tree first_part, second_part; - if (BYTES_BIG_ENDIAN) - { - second_part = args[0]; - first_part = args[1]; - } - else - { - first_part = args[0]; - second_part = args[1]; - } - tree ret_type = gimple_call_return_type (stmt); - tree ctor = build_constructor_va (ret_type, 2, NULL_TREE, first_part, - NULL_TREE, second_part); - new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ctor); - } - break; + new_stmt = aarch64_fold_combine (stmt); + break; /*lower store and load neon builtins to gimple. */ BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD) BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD) BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD) - /* Punt until after inlining, so that we stand more chance of - recording something meaningful in vector_load_decls. */ - if (!cfun->after_inlining) - break; - aarch64_record_vector_load_arg (args[0]); - if (!BYTES_BIG_ENDIAN) - { - enum aarch64_simd_type mem_type - = get_mem_type_for_load_store(fcode); - aarch64_simd_type_info_trees simd_type - = aarch64_simd_types_trees[mem_type]; - tree elt_ptr_type = build_pointer_type_for_mode (simd_type.eltype, - VOIDmode, true); - tree zero = build_zero_cst (elt_ptr_type); - /* Use element type alignment. 
*/ - tree access_type - = build_aligned_type (simd_type.itype, - TYPE_ALIGN (simd_type.eltype)); - new_stmt - = gimple_build_assign (gimple_get_lhs (stmt), - fold_build2 (MEM_REF, - access_type, - args[0], zero)); - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); - } - break; + { + enum aarch64_simd_type mem_type + = get_mem_type_for_load_store (fcode); + aarch64_simd_type_info_trees simd_type + = aarch64_simd_types_trees[mem_type]; + new_stmt = aarch64_fold_load (stmt, simd_type.itype); + break; + } BUILTIN_VALL_F16 (STORE1, st1, 0, STORE) BUILTIN_VDQ_I (STORE1_U, st1, 0, STORE) BUILTIN_VALLP_NO_DI (STORE1_P, st1, 0, STORE) - if (!BYTES_BIG_ENDIAN) - { - enum aarch64_simd_type mem_type - = get_mem_type_for_load_store(fcode); - aarch64_simd_type_info_trees simd_type - = aarch64_simd_types_trees[mem_type]; - tree elt_ptr_type = build_pointer_type_for_mode (simd_type.eltype, - VOIDmode, true); - tree zero = build_zero_cst (elt_ptr_type); - /* Use element type alignment. */ - tree access_type - = build_aligned_type (simd_type.itype, - TYPE_ALIGN (simd_type.eltype)); - new_stmt - = gimple_build_assign (fold_build2 (MEM_REF, access_type, - args[0], zero), - args[1]); - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); - } - break; + { + enum aarch64_simd_type mem_type + = get_mem_type_for_load_store (fcode); + aarch64_simd_type_info_trees simd_type + = aarch64_simd_types_trees[mem_type]; + new_stmt = aarch64_fold_store (stmt, simd_type.itype); + break; + } BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10, ALL) BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, ALL) @@ -4440,6 +5136,9 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, } break; default: + if (auto builtin_data = aarch64_get_pragma_builtin (fcode)) + new_stmt = aarch64_gimple_fold_pragma_builtin (stmt, gsi, + *builtin_data); break; } diff --git a/gcc/config/aarch64/aarch64-builtins.h b/gcc/config/aarch64/aarch64-builtins.h index f4d54de..d998370 100644 --- a/gcc/config/aarch64/aarch64-builtins.h +++ b/gcc/config/aarch64/aarch64-builtins.h @@ -28,6 +28,8 @@ enum aarch64_type_qualifiers qualifier_const = 0x2, /* 1 << 1 */ /* T *foo. */ qualifier_pointer = 0x4, /* 1 << 2 */ + /* const T *foo. */ + qualifier_const_pointer = 0x6, /* Used when expanding arguments if an operand could be an immediate. 
*/ qualifier_immediate = 0x8, /* 1 << 3 */ diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 18764e4..21c7e67 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -896,6 +896,8 @@ bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode); machine_mode aarch64_sve_int_mode (machine_mode); opt_machine_mode aarch64_sve_pred_mode (unsigned int); machine_mode aarch64_sve_pred_mode (machine_mode); +opt_machine_mode aarch64_advsimd_vector_array_mode (machine_mode, + unsigned HOST_WIDE_INT); opt_machine_mode aarch64_sve_data_mode (scalar_mode, poly_uint64); bool aarch64_sve_mode_p (machine_mode); HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int); diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def index 8924262..e725b52 100644 --- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def @@ -26,6 +26,26 @@ #define ENTRY_BINARY_LANE(N, T0, T1, T2, U, F) \ ENTRY (N, binary_lane, T0, T1, T2, none, U, F) +#undef ENTRY_BINARY_TWO_LANES +#define ENTRY_BINARY_TWO_LANES(N, T0, T1, T2, U, F) \ + ENTRY (N, binary_two_lanes, T0, T1, T2, none, U, F) + +#undef ENTRY_LOAD +#define ENTRY_LOAD(N, T0, T1, U) \ + ENTRY (N, load, T0, T1, none, none, U, LOAD) + +#undef ENTRY_LOAD_LANE +#define ENTRY_LOAD_LANE(N, T0, T1, T2, U) \ + ENTRY (N, load_lane, T0, T1, T2, none, U, LOAD) + +#undef ENTRY_STORE +#define ENTRY_STORE(N, T0, T1, U) \ + ENTRY (N, store, T0, T1, none, none, U, STORE) + +#undef ENTRY_STORE_LANE +#define ENTRY_STORE_LANE(N, T0, T1, U) \ + ENTRY (N, store_lane, T0, T1, none, none, U, STORE) + #undef ENTRY_TERNARY #define ENTRY_TERNARY(N, T0, T1, T2, T3, U, F) \ ENTRY (N, ternary, T0, T1, T2, T3, U, F) @@ -38,6 +58,10 @@ #define ENTRY_UNARY(N, T0, T1, U, F) \ ENTRY (N, unary, T0, T1, none, none, U, F) +#undef ENTRY_UNARY_LANE +#define ENTRY_UNARY_LANE(N, T0, T1, U, F) \ + ENTRY (N, unary_lane, T0, T1, none, none, U, F) + #undef ENTRY_BINARY_VHSDF #define ENTRY_BINARY_VHSDF(NAME, UNSPEC, FLAGS) \ ENTRY_BINARY (NAME##_f16, f16, f16, f16, UNSPEC, FLAGS) \ @@ -121,6 +145,7 @@ ENTRY_BINARY_VHSDF (vamin, UNSPEC_FAMIN, FP) ENTRY_TERNARY_VLUT8 (p) ENTRY_TERNARY_VLUT8 (s) ENTRY_TERNARY_VLUT8 (u) +ENTRY_TERNARY_VLUT8 (mf) ENTRY_TERNARY_VLUT16 (bf) ENTRY_TERNARY_VLUT16 (f) @@ -170,3 +195,224 @@ ENTRY_FMA_FPM (vmlallbt, f32, UNSPEC_FMLALLBT_FP8) ENTRY_FMA_FPM (vmlalltb, f32, UNSPEC_FMLALLTB_FP8) ENTRY_FMA_FPM (vmlalltt, f32, UNSPEC_FMLALLTT_FP8) #undef REQUIRED_EXTENSIONS + +// bsl +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_TERNARY (vbsl_mf8, mf8, u8, mf8, mf8, UNSPEC_BSL, QUIET) +ENTRY_TERNARY (vbslq_mf8, mf8q, u8q, mf8q, mf8q, UNSPEC_BSL, QUIET) +#undef REQUIRED_EXTENSIONS + +// combine +#define REQUIRED_EXTENSIONS nonstreaming_only (NONE) +ENTRY_BINARY (vcombine_mf8, mf8q, mf8, mf8, UNSPEC_COMBINE, QUIET) +#undef REQUIRED_EXTENSIONS + +// copy_lane +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_BINARY_TWO_LANES (vcopy_lane_mf8, mf8, mf8, mf8, + UNSPEC_VEC_COPY, QUIET) +ENTRY_BINARY_TWO_LANES (vcopyq_lane_mf8, mf8q, mf8q, mf8, + UNSPEC_VEC_COPY, QUIET) +ENTRY_BINARY_TWO_LANES (vcopy_laneq_mf8, mf8, mf8, mf8q, + UNSPEC_VEC_COPY, QUIET) +ENTRY_BINARY_TWO_LANES (vcopyq_laneq_mf8, mf8q, mf8q, mf8q, + UNSPEC_VEC_COPY, QUIET) +#undef REQUIRED_EXTENSIONS + +// create +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_UNARY 
(vcreate_mf8, mf8, u64_scalar, UNSPEC_VCREATE, QUIET) +#undef REQUIRED_EXTENSIONS + +// dup +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_UNARY (vdup_n_mf8, mf8, mf8_scalar, UNSPEC_DUP, QUIET) +ENTRY_UNARY (vdupq_n_mf8, mf8q, mf8_scalar, UNSPEC_DUP, QUIET) + +ENTRY_UNARY_LANE (vdup_lane_mf8, mf8, mf8, UNSPEC_DUP_LANE, QUIET) +ENTRY_UNARY_LANE (vdupq_lane_mf8, mf8q, mf8, UNSPEC_DUP_LANE, QUIET) +ENTRY_UNARY_LANE (vdup_laneq_mf8, mf8, mf8q, UNSPEC_DUP_LANE, QUIET) +ENTRY_UNARY_LANE (vdupq_laneq_mf8, mf8q, mf8q, UNSPEC_DUP_LANE, QUIET) +#undef REQUIRED_EXTENSIONS + +// dupb_lane +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_UNARY_LANE (vdupb_lane_mf8, mf8_scalar, mf8, UNSPEC_GET_LANE, QUIET) +ENTRY_UNARY_LANE (vdupb_laneq_mf8, mf8_scalar, mf8q, UNSPEC_GET_LANE, QUIET) +#undef REQUIRED_EXTENSIONS + +// ext +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_BINARY_LANE (vext_mf8, mf8, mf8, mf8, UNSPEC_EXT, QUIET) +ENTRY_BINARY_LANE (vextq_mf8, mf8q, mf8q, mf8q, UNSPEC_EXT, QUIET) +#undef REQUIRED_EXTENSIONS + +// ld1 +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_LOAD (vld1_mf8, mf8, mf8_scalar_const_ptr, UNSPEC_LD1) +ENTRY_LOAD (vld1q_mf8, mf8q, mf8_scalar_const_ptr, UNSPEC_LD1) +ENTRY_LOAD (vld1_dup_mf8, mf8, mf8_scalar_const_ptr, UNSPEC_DUP) +ENTRY_LOAD (vld1q_dup_mf8, mf8q, mf8_scalar_const_ptr, UNSPEC_DUP) + +ENTRY_LOAD_LANE (vld1_lane_mf8, mf8, mf8_scalar_const_ptr, mf8, + UNSPEC_SET_LANE) +ENTRY_LOAD_LANE (vld1q_lane_mf8, mf8q, mf8_scalar_const_ptr, mf8q, + UNSPEC_SET_LANE) +#undef REQUIRED_EXTENSIONS + +// ld<n> +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_LOAD (vld1_mf8_x2, mf8x2, mf8_scalar_const_ptr, UNSPEC_LD1x2) +ENTRY_LOAD (vld1q_mf8_x2, mf8qx2, mf8_scalar_const_ptr, UNSPEC_LD1x2) +ENTRY_LOAD (vld2_mf8, mf8x2, mf8_scalar_const_ptr, UNSPEC_LD2) +ENTRY_LOAD (vld2q_mf8, mf8qx2, mf8_scalar_const_ptr, UNSPEC_LD2) +ENTRY_LOAD (vld2_dup_mf8, mf8x2, mf8_scalar_const_ptr, UNSPEC_LD2_DUP) +ENTRY_LOAD (vld2q_dup_mf8, mf8qx2, mf8_scalar_const_ptr, UNSPEC_LD2_DUP) +ENTRY_LOAD_LANE (vld2_lane_mf8, mf8x2, mf8_scalar_const_ptr, mf8x2, + UNSPEC_LD2_LANE) +ENTRY_LOAD_LANE (vld2q_lane_mf8, mf8qx2, mf8_scalar_const_ptr, mf8qx2, + UNSPEC_LD2_LANE) + +ENTRY_LOAD (vld1_mf8_x3, mf8x3, mf8_scalar_const_ptr, UNSPEC_LD1x3) +ENTRY_LOAD (vld1q_mf8_x3, mf8qx3, mf8_scalar_const_ptr, UNSPEC_LD1x3) +ENTRY_LOAD (vld3_mf8, mf8x3, mf8_scalar_const_ptr, UNSPEC_LD3) +ENTRY_LOAD (vld3q_mf8, mf8qx3, mf8_scalar_const_ptr, UNSPEC_LD3) +ENTRY_LOAD (vld3_dup_mf8, mf8x3, mf8_scalar_const_ptr, UNSPEC_LD3_DUP) +ENTRY_LOAD (vld3q_dup_mf8, mf8qx3, mf8_scalar_const_ptr, UNSPEC_LD3_DUP) +ENTRY_LOAD_LANE (vld3_lane_mf8, mf8x3, mf8_scalar_const_ptr, mf8x3, + UNSPEC_LD3_LANE) +ENTRY_LOAD_LANE (vld3q_lane_mf8, mf8qx3, mf8_scalar_const_ptr, mf8qx3, + UNSPEC_LD3_LANE) + +ENTRY_LOAD (vld1_mf8_x4, mf8x4, mf8_scalar_const_ptr, UNSPEC_LD1x4) +ENTRY_LOAD (vld1q_mf8_x4, mf8qx4, mf8_scalar_const_ptr, UNSPEC_LD1x4) +ENTRY_LOAD (vld4_mf8, mf8x4, mf8_scalar_const_ptr, UNSPEC_LD4) +ENTRY_LOAD (vld4q_mf8, mf8qx4, mf8_scalar_const_ptr, UNSPEC_LD4) +ENTRY_LOAD (vld4_dup_mf8, mf8x4, mf8_scalar_const_ptr, UNSPEC_LD4_DUP) +ENTRY_LOAD (vld4q_dup_mf8, mf8qx4, mf8_scalar_const_ptr, UNSPEC_LD4_DUP) +ENTRY_LOAD_LANE (vld4_lane_mf8, mf8x4, mf8_scalar_const_ptr, mf8x4, + UNSPEC_LD4_LANE) +ENTRY_LOAD_LANE (vld4q_lane_mf8, mf8qx4, mf8_scalar_const_ptr, mf8qx4, + UNSPEC_LD4_LANE) +#undef REQUIRED_EXTENSIONS + +// mov +#define REQUIRED_EXTENSIONS 
nonstreaming_only (TARGET_SIMD) +ENTRY_UNARY (vmov_n_mf8, mf8, mf8_scalar, UNSPEC_DUP, QUIET) +ENTRY_UNARY (vmovq_n_mf8, mf8q, mf8_scalar, UNSPEC_DUP, QUIET) +#undef REQUIRED_EXTENSIONS + +// rev +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_UNARY (vrev64_mf8, mf8, mf8, UNSPEC_REV64, QUIET) +ENTRY_UNARY (vrev64q_mf8, mf8q, mf8q, UNSPEC_REV64, QUIET) + +ENTRY_UNARY (vrev32_mf8, mf8, mf8, UNSPEC_REV32, QUIET) +ENTRY_UNARY (vrev32q_mf8, mf8q, mf8q, UNSPEC_REV32, QUIET) + +ENTRY_UNARY (vrev16_mf8, mf8, mf8, UNSPEC_REV16, QUIET) +ENTRY_UNARY (vrev16q_mf8, mf8q, mf8q, UNSPEC_REV16, QUIET) +#undef REQUIRED_EXTENSIONS + +// set_lane +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_BINARY_LANE (vset_lane_mf8, mf8, mf8_scalar, mf8, UNSPEC_SET_LANE, QUIET) +ENTRY_BINARY_LANE (vsetq_lane_mf8, mf8q, mf8_scalar, mf8q, UNSPEC_SET_LANE, QUIET) +#undef REQUIRED_EXTENSIONS + +// st1 +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_STORE (vst1_mf8, mf8_scalar_ptr, mf8, UNSPEC_ST1) +ENTRY_STORE (vst1q_mf8, mf8_scalar_ptr, mf8q, UNSPEC_ST1) + +ENTRY_STORE_LANE (vst1_lane_mf8, mf8_scalar_ptr, mf8, UNSPEC_ST1_LANE) +ENTRY_STORE_LANE (vst1q_lane_mf8, mf8_scalar_ptr, mf8q, UNSPEC_ST1_LANE) +#undef REQUIRED_EXTENSIONS + +// st<n> +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_STORE (vst2_mf8, mf8_scalar_ptr, mf8x2, UNSPEC_ST2) +ENTRY_STORE (vst2q_mf8, mf8_scalar_ptr, mf8qx2, UNSPEC_ST2) +ENTRY_STORE (vst1_mf8_x2, mf8_scalar_ptr, mf8x2, UNSPEC_ST1x2) +ENTRY_STORE (vst1q_mf8_x2, mf8_scalar_ptr, mf8qx2, UNSPEC_ST1x2) +ENTRY_STORE_LANE (vst2_lane_mf8, mf8_scalar_ptr, mf8x2, UNSPEC_ST2_LANE) +ENTRY_STORE_LANE (vst2q_lane_mf8, mf8_scalar_ptr, mf8qx2, UNSPEC_ST2_LANE) + +ENTRY_STORE (vst3_mf8, mf8_scalar_ptr, mf8x3, UNSPEC_ST3) +ENTRY_STORE (vst3q_mf8, mf8_scalar_ptr, mf8qx3, UNSPEC_ST3) +ENTRY_STORE (vst1_mf8_x3, mf8_scalar_ptr, mf8x3, UNSPEC_ST1x3) +ENTRY_STORE (vst1q_mf8_x3, mf8_scalar_ptr, mf8qx3, UNSPEC_ST1x3) +ENTRY_STORE_LANE (vst3_lane_mf8, mf8_scalar_ptr, mf8x3, UNSPEC_ST3_LANE) +ENTRY_STORE_LANE (vst3q_lane_mf8, mf8_scalar_ptr, mf8qx3, UNSPEC_ST3_LANE) + +ENTRY_STORE (vst4_mf8, mf8_scalar_ptr, mf8x4, UNSPEC_ST4) +ENTRY_STORE (vst4q_mf8, mf8_scalar_ptr, mf8qx4, UNSPEC_ST4) +ENTRY_STORE (vst1_mf8_x4, mf8_scalar_ptr, mf8x4, UNSPEC_ST1x4) +ENTRY_STORE (vst1q_mf8_x4, mf8_scalar_ptr, mf8qx4, UNSPEC_ST1x4) +ENTRY_STORE_LANE (vst4_lane_mf8, mf8_scalar_ptr, mf8x4, UNSPEC_ST4_LANE) +ENTRY_STORE_LANE (vst4q_lane_mf8, mf8_scalar_ptr, mf8qx4, UNSPEC_ST4_LANE) +#undef REQUIRED_EXTENSIONS + +// tbl<n> +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_BINARY (vtbl1_mf8, mf8, mf8, u8, UNSPEC_TBL, QUIET) +ENTRY_BINARY (vtbl2_mf8, mf8, mf8x2, u8, UNSPEC_TBL, QUIET) +ENTRY_BINARY (vtbl3_mf8, mf8, mf8x3, u8, UNSPEC_TBL, QUIET) +ENTRY_BINARY (vtbl4_mf8, mf8, mf8x4, u8, UNSPEC_TBL, QUIET) + +ENTRY_BINARY (vqtbl1_mf8, mf8, mf8q, u8, UNSPEC_TBL, QUIET) +ENTRY_BINARY (vqtbl1q_mf8, mf8q, mf8q, u8q, UNSPEC_TBL, QUIET) +ENTRY_BINARY (vqtbl2_mf8, mf8, mf8qx2, u8, UNSPEC_TBL, QUIET) +ENTRY_BINARY (vqtbl2q_mf8, mf8q, mf8qx2, u8q, UNSPEC_TBL, QUIET) +ENTRY_BINARY (vqtbl3_mf8, mf8, mf8qx3, u8, UNSPEC_TBL, QUIET) +ENTRY_BINARY (vqtbl3q_mf8, mf8q, mf8qx3, u8q, UNSPEC_TBL, QUIET) +ENTRY_BINARY (vqtbl4_mf8, mf8, mf8qx4, u8, UNSPEC_TBL, QUIET) +ENTRY_BINARY (vqtbl4q_mf8, mf8q, mf8qx4, u8q, UNSPEC_TBL, QUIET) +#undef REQUIRED_EXTENSIONS + +// tbx<n> +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_TERNARY (vtbx1_mf8, mf8, mf8, mf8, u8, 
UNSPEC_TBX, QUIET) +ENTRY_TERNARY (vtbx2_mf8, mf8, mf8, mf8x2, u8, UNSPEC_TBX, QUIET) +ENTRY_TERNARY (vtbx3_mf8, mf8, mf8, mf8x3, u8, UNSPEC_TBX, QUIET) +ENTRY_TERNARY (vtbx4_mf8, mf8, mf8, mf8x4, u8, UNSPEC_TBX, QUIET) + +ENTRY_TERNARY (vqtbx1_mf8, mf8, mf8, mf8q, u8, UNSPEC_TBX, QUIET) +ENTRY_TERNARY (vqtbx1q_mf8, mf8q, mf8q, mf8q, u8q, UNSPEC_TBX, QUIET) +ENTRY_TERNARY (vqtbx2_mf8, mf8, mf8, mf8qx2, u8, UNSPEC_TBX, QUIET) +ENTRY_TERNARY (vqtbx2q_mf8, mf8q, mf8q, mf8qx2, u8q, UNSPEC_TBX, QUIET) +ENTRY_TERNARY (vqtbx3_mf8, mf8, mf8, mf8qx3, u8, UNSPEC_TBX, QUIET) +ENTRY_TERNARY (vqtbx3q_mf8, mf8q, mf8q, mf8qx3, u8q, UNSPEC_TBX, QUIET) +ENTRY_TERNARY (vqtbx4_mf8, mf8, mf8, mf8qx4, u8, UNSPEC_TBX, QUIET) +ENTRY_TERNARY (vqtbx4q_mf8, mf8q, mf8q, mf8qx4, u8q, UNSPEC_TBX, QUIET) +#undef REQUIRED_EXTENSIONS + +// trn<n> +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_BINARY (vtrn1_mf8, mf8, mf8, mf8, UNSPEC_TRN1, QUIET) +ENTRY_BINARY (vtrn1q_mf8, mf8q, mf8q, mf8q, UNSPEC_TRN1, QUIET) +ENTRY_BINARY (vtrn2_mf8, mf8, mf8, mf8, UNSPEC_TRN2, QUIET) +ENTRY_BINARY (vtrn2q_mf8, mf8q, mf8q, mf8q, UNSPEC_TRN2, QUIET) +ENTRY_BINARY (vtrn_mf8, mf8x2, mf8, mf8, UNSPEC_TRN, QUIET) +ENTRY_BINARY (vtrnq_mf8, mf8qx2, mf8q, mf8q, UNSPEC_TRN, QUIET) +#undef REQUIRED_EXTENSIONS + +// uzp<n> +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_BINARY (vuzp1_mf8, mf8, mf8, mf8, UNSPEC_UZP1, QUIET) +ENTRY_BINARY (vuzp1q_mf8, mf8q, mf8q, mf8q, UNSPEC_UZP1, QUIET) +ENTRY_BINARY (vuzp2_mf8, mf8, mf8, mf8, UNSPEC_UZP2, QUIET) +ENTRY_BINARY (vuzp2q_mf8, mf8q, mf8q, mf8q, UNSPEC_UZP2, QUIET) +ENTRY_BINARY (vuzp_mf8, mf8x2, mf8, mf8, UNSPEC_UZP, QUIET) +ENTRY_BINARY (vuzpq_mf8, mf8qx2, mf8q, mf8q, UNSPEC_UZP, QUIET) +#undef REQUIRED_EXTENSIONS + +// zip<n> +#define REQUIRED_EXTENSIONS nonstreaming_only (TARGET_SIMD) +ENTRY_BINARY (vzip1_mf8, mf8, mf8, mf8, UNSPEC_ZIP1, QUIET) +ENTRY_BINARY (vzip1q_mf8, mf8q, mf8q, mf8q, UNSPEC_ZIP1, QUIET) +ENTRY_BINARY (vzip2_mf8, mf8, mf8, mf8, UNSPEC_ZIP2, QUIET) +ENTRY_BINARY (vzip2q_mf8, mf8q, mf8q, mf8q, UNSPEC_ZIP2, QUIET) +ENTRY_BINARY (vzip_mf8, mf8x2, mf8, mf8, UNSPEC_ZIP, QUIET) +ENTRY_BINARY (vzipq_mf8, mf8qx2, mf8q, mf8q, UNSPEC_ZIP, QUIET) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7959cca..237de1b 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -112,7 +112,7 @@ } ) -(define_insn "aarch64_dup_lane<mode>" +(define_insn "@aarch64_dup_lane<mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (vec_duplicate:VALL_F16 (vec_select:<VEL> @@ -127,7 +127,7 @@ [(set_attr "type" "neon_dup<q>")] ) -(define_insn "aarch64_dup_lane_<vswap_width_name><mode>" +(define_insn "@aarch64_dup_lane_<vswap_width_name><mode>" [(set (match_operand:VALL_F16_NO_V2Q 0 "register_operand" "=w") (vec_duplicate:VALL_F16_NO_V2Q (vec_select:<VEL> @@ -1164,7 +1164,7 @@ [(set_attr "type" "neon_logic<q>")] ) -(define_insn "aarch64_simd_vec_set<mode>" +(define_insn "@aarch64_simd_vec_set<mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w") (vec_merge:VALL_F16 (vec_duplicate:VALL_F16 @@ -1225,7 +1225,7 @@ [(set_attr "type" "neon_ins<q>")] ) -(define_insn "*aarch64_simd_vec_copy_lane_<vswap_width_name><mode>" +(define_insn "@aarch64_simd_vec_copy_lane_<vswap_width_name><mode>" [(set (match_operand:VALL_F16_NO_V2Q 0 "register_operand" "=w") (vec_merge:VALL_F16_NO_V2Q (vec_duplicate:VALL_F16_NO_V2Q @@ -3837,7 +3837,7 @@ } ) -(define_expand 
"aarch64_simd_bsl<mode>" +(define_expand "@aarch64_simd_bsl<mode>" [(match_operand:VALLDIF 0 "register_operand") (match_operand:<V_INT_EQUIV> 1 "register_operand") (match_operand:VALLDIF 2 "register_operand") @@ -4438,7 +4438,7 @@ ;; Form a vector whose least significant half comes from operand 1 and whose ;; most significant half comes from operand 2. This operand order follows ;; arm_neon.h vcombine* intrinsics. -(define_expand "aarch64_combine<mode>" +(define_expand "@aarch64_combine<mode>" [(match_operand:<VDBL> 0 "register_operand") (match_operand:VDC 1 "general_operand") (match_operand:VDC 2 "general_operand")] @@ -6971,7 +6971,7 @@ ;; Note, we have constraints for Dz and Z as different expanders ;; have different ideas of what should be passed to this pattern. -(define_insn "aarch64_cm<optab><mode><vczle><vczbe>" +(define_insn "@aarch64_cm<optab><mode><vczle><vczbe>" [(set (match_operand:<V_INT_EQUIV> 0 "register_operand") (neg:<V_INT_EQUIV> (COMPARISONS:<V_INT_EQUIV> @@ -7036,7 +7036,7 @@ ;; cm(hs|hi) -(define_insn "aarch64_cm<optab><mode><vczle><vczbe>" +(define_insn "@aarch64_cm<optab><mode><vczle><vczbe>" [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w") (neg:<V_INT_EQUIV> (UCOMPARISONS:<V_INT_EQUIV> @@ -7188,7 +7188,7 @@ ;; fcm(eq|ge|gt|le|lt) -(define_insn "aarch64_cm<optab><mode><vczle><vczbe>" +(define_insn "@aarch64_cm<optab><mode><vczle><vczbe>" [(set (match_operand:<V_INT_EQUIV> 0 "register_operand") (neg:<V_INT_EQUIV> (COMPARISONS:<V_INT_EQUIV> @@ -7349,7 +7349,7 @@ [(set_attr "type" "neon_load2_2reg<q>")] ) -(define_insn "aarch64_simd_ld2r<vstruct_elt>" +(define_insn "@aarch64_simd_ld2r<vstruct_elt>" [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w") (unspec:VSTRUCT_2QD [ (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")] @@ -7359,7 +7359,7 @@ [(set_attr "type" "neon_load2_all_lanes<q>")] ) -(define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>" +(define_insn "@aarch64_vec_load_lanes<mode>_lane<vstruct_elt>" [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w") (unspec:VSTRUCT_2QD [ (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv") @@ -7449,7 +7449,7 @@ [(set_attr "type" "neon_load3_3reg<q>")] ) -(define_insn "aarch64_simd_ld3r<vstruct_elt>" +(define_insn "@aarch64_simd_ld3r<vstruct_elt>" [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w") (unspec:VSTRUCT_3QD [ (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")] @@ -7549,7 +7549,7 @@ [(set_attr "type" "neon_load4_4reg<q>")] ) -(define_insn "aarch64_simd_ld4r<vstruct_elt>" +(define_insn "@aarch64_simd_ld4r<vstruct_elt>" [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w") (unspec:VSTRUCT_4QD [ (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")] @@ -7773,7 +7773,7 @@ operands[1] = force_reg (V8DImode, operands[1]); }) -(define_expand "aarch64_ld1x3<vstruct_elt>" +(define_expand "@aarch64_ld1x3<vstruct_elt>" [(match_operand:VSTRUCT_3QD 0 "register_operand") (match_operand:DI 1 "register_operand")] "TARGET_SIMD" @@ -7793,7 +7793,7 @@ [(set_attr "type" "neon_load1_3reg<q>")] ) -(define_expand "aarch64_ld1x4<vstruct_elt>" +(define_expand "@aarch64_ld1x4<vstruct_elt>" [(match_operand:VSTRUCT_4QD 0 "register_operand" "=w") (match_operand:DI 1 "register_operand" "r")] "TARGET_SIMD" @@ -7813,7 +7813,7 @@ [(set_attr "type" "neon_load1_4reg<q>")] ) -(define_expand "aarch64_st1x2<vstruct_elt>" +(define_expand "@aarch64_st1x2<vstruct_elt>" [(match_operand:DI 0 "register_operand") (match_operand:VSTRUCT_2QD 1 "register_operand")] "TARGET_SIMD" @@ -7833,7 
+7833,7 @@ [(set_attr "type" "neon_store1_2reg<q>")] ) -(define_expand "aarch64_st1x3<vstruct_elt>" +(define_expand "@aarch64_st1x3<vstruct_elt>" [(match_operand:DI 0 "register_operand") (match_operand:VSTRUCT_3QD 1 "register_operand")] "TARGET_SIMD" @@ -7853,7 +7853,7 @@ [(set_attr "type" "neon_store1_3reg<q>")] ) -(define_expand "aarch64_st1x4<vstruct_elt>" +(define_expand "@aarch64_st1x4<vstruct_elt>" [(match_operand:DI 0 "register_operand" "") (match_operand:VSTRUCT_4QD 1 "register_operand" "")] "TARGET_SIMD" @@ -8220,7 +8220,7 @@ [(set_attr "type" "neon_load1_4reg<q>")] ) -(define_expand "aarch64_ld<nregs><vstruct_elt>" +(define_expand "@aarch64_ld<nregs><vstruct_elt>" [(match_operand:VSTRUCT_D 0 "register_operand") (match_operand:DI 1 "register_operand")] "TARGET_SIMD" @@ -8230,7 +8230,7 @@ DONE; }) -(define_expand "aarch64_ld1<VALL_F16:mode>" +(define_expand "@aarch64_ld1<VALL_F16:mode>" [(match_operand:VALL_F16 0 "register_operand") (match_operand:DI 1 "register_operand")] "TARGET_SIMD" @@ -8245,7 +8245,7 @@ DONE; }) -(define_expand "aarch64_ld<nregs><vstruct_elt>" +(define_expand "@aarch64_ld<nregs><vstruct_elt>" [(match_operand:VSTRUCT_Q 0 "register_operand") (match_operand:DI 1 "register_operand")] "TARGET_SIMD" @@ -8255,7 +8255,7 @@ DONE; }) -(define_expand "aarch64_ld1x2<vstruct_elt>" +(define_expand "@aarch64_ld1x2<vstruct_elt>" [(match_operand:VSTRUCT_2QD 0 "register_operand") (match_operand:DI 1 "register_operand")] "TARGET_SIMD" @@ -8267,7 +8267,7 @@ DONE; }) -(define_expand "aarch64_ld<nregs>_lane<vstruct_elt>" +(define_expand "@aarch64_ld<nregs>_lane<vstruct_elt>" [(match_operand:VSTRUCT_QD 0 "register_operand") (match_operand:DI 1 "register_operand") (match_operand:VSTRUCT_QD 2 "register_operand") @@ -8411,7 +8411,7 @@ ;; This instruction's pattern is generated directly by ;; aarch64_expand_vec_perm_const, so any changes to the pattern would ;; need corresponding changes there. -(define_insn "aarch64_<PERMUTE:perm_insn><mode><vczle><vczbe>" +(define_insn "@aarch64_<PERMUTE:perm_insn><mode><vczle><vczbe>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w") (match_operand:VALL_F16 2 "register_operand" "w")] @@ -8437,7 +8437,7 @@ ;; aarch64_expand_vec_perm_const, so any changes to the pattern would ;; need corresponding changes there. Note that the immediate (third) ;; operand is a lane index not a byte index. -(define_insn "aarch64_ext<mode>" +(define_insn "@aarch64_ext<mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w") (match_operand:VALL_F16 2 "register_operand" "w") @@ -8455,7 +8455,7 @@ ;; This instruction's pattern is generated directly by ;; aarch64_expand_vec_perm_const, so any changes to the pattern would ;; need corresponding changes there. 
-(define_insn "aarch64_rev<REVERSE:rev_op><mode><vczle><vczbe>" +(define_insn "@aarch64_rev<REVERSE:rev_op><mode><vczle><vczbe>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")] REVERSE))] @@ -8524,7 +8524,7 @@ [(set_attr "type" "neon_store1_4reg")] ) -(define_expand "aarch64_st<nregs><vstruct_elt>" +(define_expand "@aarch64_st<nregs><vstruct_elt>" [(match_operand:DI 0 "register_operand") (match_operand:VSTRUCT_D 1 "register_operand")] "TARGET_SIMD" @@ -8534,7 +8534,7 @@ DONE; }) -(define_expand "aarch64_st<nregs><vstruct_elt>" +(define_expand "@aarch64_st<nregs><vstruct_elt>" [(match_operand:DI 0 "register_operand") (match_operand:VSTRUCT_Q 1 "register_operand")] "TARGET_SIMD" @@ -8544,7 +8544,7 @@ DONE; }) -(define_expand "aarch64_st<nregs>_lane<vstruct_elt>" +(define_expand "@aarch64_st<nregs>_lane<vstruct_elt>" [(match_operand:DI 0 "register_operand") (match_operand:VSTRUCT_QD 1 "register_operand") (match_operand:SI 2 "immediate_operand")] @@ -8560,7 +8560,7 @@ DONE; }) -(define_expand "aarch64_st1<VALL_F16:mode>" +(define_expand "@aarch64_st1<VALL_F16:mode>" [(match_operand:DI 0 "register_operand") (match_operand:VALL_F16 1 "register_operand")] "TARGET_SIMD" diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 41cc2ee..6bb4bdf 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -1802,7 +1802,7 @@ aarch64_ldn_stn_vectors (machine_mode mode) /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the corresponding vector structure mode. */ -static opt_machine_mode +opt_machine_mode aarch64_advsimd_vector_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems) { diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index edac1ae..c62de38 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -198,8 +198,10 @@ UNSPEC_AUTIB1716 UNSPEC_AUTIASP UNSPEC_AUTIBSP + UNSPEC_BSL UNSPEC_CALLEE_ABI UNSPEC_CASESI + UNSPEC_COMBINE UNSPEC_CPYMEM UNSPEC_CRC32B UNSPEC_CRC32CB @@ -209,6 +211,8 @@ UNSPEC_CRC32H UNSPEC_CRC32W UNSPEC_CRC32X + UNSPEC_DUP + UNSPEC_DUP_LANE UNSPEC_FCVTZS UNSPEC_FCVTZU UNSPEC_FJCVTZS @@ -227,6 +231,7 @@ UNSPEC_FRINTP UNSPEC_FRINTX UNSPEC_FRINTZ + UNSPEC_GET_LANE UNSPEC_GOTSMALLPIC UNSPEC_GOTSMALLPIC28K UNSPEC_GOTSMALLTLS @@ -236,6 +241,10 @@ UNSPEC_LDP_FST UNSPEC_LDP_SND UNSPEC_LD1 + UNSPEC_LD1_DUP + UNSPEC_LD1x2 + UNSPEC_LD1x3 + UNSPEC_LD1x4 UNSPEC_LD2 UNSPEC_LD2_DREG UNSPEC_LD2_DUP @@ -265,12 +274,17 @@ UNSPEC_REV UNSPEC_SADALP UNSPEC_SCVTF + UNSPEC_SET_LANE UNSPEC_SETMEM UNSPEC_SISD_NEG UNSPEC_SISD_SSHL UNSPEC_SISD_USHL UNSPEC_SSHL_2S UNSPEC_ST1 + UNSPEC_ST1_LANE + UNSPEC_ST1x2 + UNSPEC_ST1x3 + UNSPEC_ST1x4 UNSPEC_ST2 UNSPEC_ST3 UNSPEC_ST4 @@ -314,6 +328,8 @@ UNSPEC_UNPACKSLO UNSPEC_UNPACKULO UNSPEC_PACK + UNSPEC_VCREATE + UNSPEC_VEC_COPY UNSPEC_WHILEGE UNSPEC_WHILEGT UNSPEC_WHILEHI diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 34200b0..07b9754 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -1095,6 +1095,7 @@ UNSPEC_SUBHNB ; Used in aarch64-sve2.md. UNSPEC_SUBHNT ; Used in aarch64-sve2.md. UNSPEC_TBL2 ; Used in aarch64-sve2.md. + UNSPEC_TRN ; Used in aarch64-builtins.cc UNSPEC_UABDLB ; Used in aarch64-sve2.md. UNSPEC_UABDLT ; Used in aarch64-sve2.md. UNSPEC_UADDLB ; Used in aarch64-sve2.md. 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h index 6f4d62b..567ca2a8 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h @@ -7,6 +7,7 @@ #include <inttypes.h> /* helper type, to help write floating point results in integer form. */ +typedef uint8_t hmfloat8_t; typedef uint16_t hfloat16_t; typedef uint32_t hfloat32_t; typedef uint64_t hfloat64_t; @@ -38,10 +39,24 @@ extern size_t strlen(const char *); Use this macro to guard against them. */ #ifdef __aarch64__ #define AARCH64_ONLY(X) X +#define MFLOAT8_SUPPORTED 1 #else #define AARCH64_ONLY(X) +#define MFLOAT8_SUPPORTED 0 #endif +#if MFLOAT8_SUPPORTED +#define MFLOAT8_ONLY(X) X +#define MFLOAT8(X) (((union { uint8_t x; mfloat8_t y; }) { X }).y) +#define CONVERT(T, X) \ + ((T) _Generic ((T){}, mfloat8_t: MFLOAT8(X), default: X)) +#else +#define MFLOAT8_ONLY(X) +#define CONVERT(T, X) ((T) X) +#endif + +#define BITEQUAL(X, Y) (__builtin_memcmp (&X, &Y, sizeof(X)) == 0) + #define xSTR(X) #X #define STR(X) xSTR(X) @@ -182,6 +197,9 @@ static ARRAY(result, poly, 16, 4); #if defined (__ARM_FEATURE_CRYPTO) static ARRAY(result, poly, 64, 1); #endif +#if MFLOAT8_SUPPORTED +static ARRAY(result, mfloat, 8, 8); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) static ARRAY(result, float, 16, 4); #endif @@ -202,6 +220,9 @@ static ARRAY(result, poly, 16, 8); #if defined (__ARM_FEATURE_CRYPTO) static ARRAY(result, poly, 64, 2); #endif +#if MFLOAT8_SUPPORTED +static ARRAY(result, mfloat, 8, 16); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) static ARRAY(result, float, 16, 8); #endif @@ -222,6 +243,9 @@ extern ARRAY(expected, uint, 32, 2); extern ARRAY(expected, uint, 64, 1); extern ARRAY(expected, poly, 8, 8); extern ARRAY(expected, poly, 16, 4); +#if MFLOAT8_SUPPORTED +extern ARRAY(expected, hmfloat, 8, 8); +#endif extern ARRAY(expected, hfloat, 16, 4); extern ARRAY(expected, hfloat, 32, 2); extern ARRAY(expected, hfloat, 64, 1); @@ -235,6 +259,9 @@ extern ARRAY(expected, uint, 32, 4); extern ARRAY(expected, uint, 64, 2); extern ARRAY(expected, poly, 8, 16); extern ARRAY(expected, poly, 16, 8); +#if MFLOAT8_SUPPORTED +extern ARRAY(expected, hmfloat, 8, 16); +#endif extern ARRAY(expected, hfloat, 16, 8); extern ARRAY(expected, hfloat, 32, 4); extern ARRAY(expected, hfloat, 64, 2); @@ -251,6 +278,8 @@ extern ARRAY(expected, hfloat, 64, 2); CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ + MFLOAT8_ONLY(CHECK_FP(test_name, mfloat, 8, 8, PRIx8, \ + EXPECTED, comment);) \ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \ \ CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \ @@ -263,6 +292,8 @@ extern ARRAY(expected, hfloat, 64, 2); CHECK(test_name, uint, 64, 2, PRIx64, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \ + MFLOAT8_ONLY(CHECK_FP(test_name, mfloat, 8, 16, PRIx8, \ + EXPECTED, comment);) \ CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment); \ } \ @@ -372,6 +403,9 @@ static void clean_results (void) #if defined (__ARM_FEATURE_CRYPTO) CLEAN(result, poly, 64, 1); #endif +#if MFLOAT8_SUPPORTED + CLEAN(result, 
mfloat, 8, 8); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) CLEAN(result, float, 16, 4); #endif @@ -390,6 +424,9 @@ static void clean_results (void) #if defined (__ARM_FEATURE_CRYPTO) CLEAN(result, poly, 64, 2); #endif +#if MFLOAT8_SUPPORTED + CLEAN(result, mfloat, 8, 16); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) CLEAN(result, float, 16, 8); #endif @@ -460,6 +497,7 @@ static void clean_results (void) DECL_VARIABLE(VAR, poly, 8, 8); \ DECL_VARIABLE(VAR, poly, 16, 4); \ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 1); \ + MFLOAT8_ONLY(DECL_VARIABLE(VAR, mfloat, 8, 8);) \ DECL_VARIABLE(VAR, float, 16, 4); \ DECL_VARIABLE(VAR, float, 32, 2) #else @@ -480,6 +518,7 @@ static void clean_results (void) DECL_VARIABLE(VAR, poly, 8, 16); \ DECL_VARIABLE(VAR, poly, 16, 8); \ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 2); \ + MFLOAT8_ONLY(DECL_VARIABLE(VAR, mfloat, 8, 16);) \ DECL_VARIABLE(VAR, float, 16, 8); \ DECL_VARIABLE(VAR, float, 32, 4); \ AARCH64_ONLY(DECL_VARIABLE(VAR, float, 64, 2)) @@ -490,6 +529,7 @@ static void clean_results (void) DECL_VARIABLE(VAR, poly, 8, 16); \ DECL_VARIABLE(VAR, poly, 16, 8); \ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 2); \ + MFLOAT8_ONLY(DECL_VARIABLE(VAR, mfloat, 8, 16);) \ DECL_VARIABLE(VAR, float, 32, 4); \ AARCH64_ONLY(DECL_VARIABLE(VAR, float, 64, 2)) #endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h index f8c4aef..7666ae0 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h @@ -122,6 +122,10 @@ PAD(buffer_pad, uint, 64, 1); VECT_VAR_DECL_INIT(buffer, poly, 64, 1); PAD(buffer_pad, poly, 64, 1); #endif +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(buffer, mfloat, 8, 8)[8]; +PAD(buffer_pad, mfloat, 8, 8); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer, float, 16, 4); PAD(buffer_pad, float, 16, 4); @@ -152,6 +156,10 @@ PAD(buffer_pad, poly, 16, 8); VECT_VAR_DECL_INIT(buffer, poly, 64, 2); PAD(buffer_pad, poly, 64, 2); #endif +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(buffer, mfloat, 8, 16)[16]; +PAD(buffer_pad, mfloat, 8, 16); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer, float, 16, 8); PAD(buffer_pad, float, 16, 8); @@ -190,6 +198,10 @@ VECT_VAR_DECL(buffer_dup_pad, poly, 16, 4); VECT_VAR_DECL_INIT4(buffer_dup, poly, 64, 1); VECT_VAR_DECL(buffer_dup_pad, poly, 64, 1); #endif +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(buffer_dup, mfloat, 8, 8)[8]; +PAD(buffer_dup_pad, mfloat, 8, 8); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT4(buffer_dup, float, 16, 4); VECT_VAR_DECL(buffer_dup_pad, float, 16, 4); @@ -221,9 +233,26 @@ VECT_VAR_DECL(buffer_dup_pad, poly, 16, 8); VECT_VAR_DECL_INIT4(buffer_dup, poly, 64, 2); VECT_VAR_DECL(buffer_dup_pad, poly, 64, 2); #endif +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(buffer_dup, mfloat, 8, 16)[16]; +PAD(buffer_dup_pad, mfloat, 8, 16); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer_dup, float, 16, 8); VECT_VAR_DECL(buffer_dup_pad, float, 16, 8); #endif VECT_VAR_DECL_INIT(buffer_dup, float, 32, 4); VECT_VAR_DECL(buffer_dup_pad, float, 32, 4); + +#if MFLOAT8_SUPPORTED +static void __attribute__((constructor)) 
+copy_mfloat8 () +{ + memcpy (VECT_VAR(buffer, mfloat, 8, 8), VECT_VAR(buffer, uint, 8, 8), 8); + memcpy (VECT_VAR(buffer, mfloat, 8, 16), VECT_VAR(buffer, uint, 8, 16), 16); + memcpy (VECT_VAR(buffer_dup, mfloat, 8, 8), + VECT_VAR(buffer_dup, uint, 8, 8), 8); + memcpy (VECT_VAR(buffer_dup, mfloat, 8, 16), + VECT_VAR(buffer_dup, uint, 8, 16), 16); +} +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c index e9b3dfd..4c50acc 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c @@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffff1 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf3, 0xf3, 0xf3, 0xf3, 0xf7, 0xf7, 0xf7, 0xf7 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff2, 0xfff2 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xfa, 0xfa, 0xfa, 0xfa, + 0xfe, 0xfe, 0xfe, 0xfe }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc09, 0xcb89, 0xcb09, 0xca89 }; @@ -47,6 +51,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf3, 0xf3, 0xf3, 0xf3, 0xf7, 0xf7, 0xf7, 0xf7 }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff2, 0xfff2, 0xfff4, 0xfff4, 0xfff6, 0xfff6 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1, + 0xf5, 0xf5, 0xf5, 0xf5, + 0xf1, 0xf1, 0xf1, 0xf1, + 0xf5, 0xf5, 0xf5, 0xf5 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc09, 0xcb89, 0xcb09, 0xca89, @@ -76,6 +86,10 @@ void exec_vbsl (void) clean_results (); TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer); +#if MFLOAT8_SUPPORTED + VLOAD(vector, buffer, , mfloat, mf, 8, 8); + VLOAD(vector, buffer, q, mfloat, mf, 8, 16); +#endif #if defined (FP16_SUPPORTED) VLOAD(vector, buffer, , float, f, 16, 4); VLOAD(vector, buffer, q, float, f, 16, 8); @@ -94,6 +108,7 @@ void exec_vbsl (void) VDUP(vector2, , uint, u, 16, 4, 0xFFF2); VDUP(vector2, , uint, u, 32, 2, 0xFFFFFFF0); VDUP(vector2, , uint, u, 64, 1, 0xFFFFFFF3); + MFLOAT8_ONLY(VDUP(vector2, , mfloat, mf, 8, 8, MFLOAT8(0xca));) #if defined (FP16_SUPPORTED) VDUP(vector2, , float, f, 16, 4, -2.4f); /* -2.4f is 0xC0CD. 
*/ #endif @@ -111,6 +126,7 @@ void exec_vbsl (void) VDUP(vector2, q, uint, u, 64, 2, 0xFFFFFFF3); VDUP(vector2, q, poly, p, 8, 16, 0xF3); VDUP(vector2, q, poly, p, 16, 8, 0xFFF2); + MFLOAT8_ONLY(VDUP(vector2, q, mfloat, mf, 8, 16, MFLOAT8(0x55));) #if defined (FP16_SUPPORTED) VDUP(vector2, q, float, f, 16, 8, -2.4f); #endif @@ -131,6 +147,10 @@ void exec_vbsl (void) TEST_VBSL(uint, , poly, p, 16, 4); TEST_VBSL(uint, q, poly, p, 8, 16); TEST_VBSL(uint, q, poly, p, 16, 8); +#if MFLOAT8_SUPPORTED + TEST_VBSL(uint, , mfloat, mf, 8, 8); + TEST_VBSL(uint, q, mfloat, mf, 8, 16); +#endif #if defined (FP16_SUPPORTED) TEST_VBSL(uint, , float, f, 16, 4); TEST_VBSL(uint, q, float, f, 16, 8); diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c index e9d31d6..05933f9 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c @@ -25,6 +25,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, 0x66, 0x66, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7, + 0xcc, 0xcc, 0xcc, 0xcc, + 0xcc, 0xcc, 0xcc, 0xcc }; +#endif VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000, 0x40533333, 0x40533333 }; VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80, @@ -46,6 +52,7 @@ void exec_vcombine (void) /* Initialize input "vector64_a" from "buffer". */ TEST_MACRO_64BITS_VARIANTS_2_5(VLOAD, vector64_a, buffer); + MFLOAT8_ONLY(VLOAD(vector64_a, buffer, , mfloat, mf, 8, 8);) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VLOAD(vector64_a, buffer, , float, f, 16, 4); #endif @@ -62,6 +69,7 @@ void exec_vcombine (void) VDUP(vector64_b, , uint, u, 64, 1, 0x88); VDUP(vector64_b, , poly, p, 8, 8, 0x55); VDUP(vector64_b, , poly, p, 16, 4, 0x66); + MFLOAT8_ONLY(VDUP(vector64_b, , mfloat, mf, 8, 8, MFLOAT8(0xcc));) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VDUP(vector64_b, , float, f, 16, 4, 2.25); #endif @@ -80,6 +88,7 @@ void exec_vcombine (void) TEST_VCOMBINE(uint, u, 64, 1, 2); TEST_VCOMBINE(poly, p, 8, 8, 16); TEST_VCOMBINE(poly, p, 16, 4, 8); + MFLOAT8_ONLY(TEST_VCOMBINE(mfloat, mf, 8, 8, 16);) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) TEST_VCOMBINE(float, f, 16, 4, 8); #endif @@ -95,6 +104,7 @@ void exec_vcombine (void) CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, ""); CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, ""); CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 16, PRIx16, expected, "");) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, ""); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c index c0b9c7a..77d9be2 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c @@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x123456789abcdef0 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xde, 0xbc, 0x9a, 0x78, 0x56, 0x34, 0x12 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xdef0, 0x9abc, 
0x5678, 0x1234 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xf0, 0xde, 0xbc, 0x9a, + 0x78, 0x56, 0x34, 0x12 }; +#endif VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xdef0, 0x9abc, 0x5678, 0x1234 }; VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x9abcdef0, 0x12345678 }; @@ -39,6 +43,7 @@ FNNAME (INSN_NAME) DECL_VAL(val, int, 16, 4); DECL_VAL(val, int, 32, 2); DECL_VAL(val, int, 64, 1); + MFLOAT8_ONLY(DECL_VAL(val, mfloat, 8, 8);) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) DECL_VAL(val, float, 16, 4); #endif @@ -54,6 +59,7 @@ FNNAME (INSN_NAME) DECL_VARIABLE(vector_res, int, 16, 4); DECL_VARIABLE(vector_res, int, 32, 2); DECL_VARIABLE(vector_res, int, 64, 1); + MFLOAT8_ONLY(DECL_VARIABLE(vector_res, mfloat, 8, 8);) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) DECL_VARIABLE(vector_res, float, 16, 4); #endif @@ -72,6 +78,7 @@ FNNAME (INSN_NAME) VECT_VAR(val, int, 16, 4) = 0x123456789abcdef0LL; VECT_VAR(val, int, 32, 2) = 0x123456789abcdef0LL; VECT_VAR(val, int, 64, 1) = 0x123456789abcdef0LL; + MFLOAT8_ONLY(VECT_VAR(val, mfloat, 8, 8) = 0x123456789abcdef0LL;) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR(val, float, 16, 4) = 0x123456789abcdef0LL; #endif @@ -86,6 +93,7 @@ FNNAME (INSN_NAME) TEST_VCREATE(int, s, 8, 8); TEST_VCREATE(int, s, 16, 4); TEST_VCREATE(int, s, 32, 2); + MFLOAT8_ONLY(TEST_VCREATE(mfloat, mf, 8, 8);) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) TEST_VCREATE(float, f, 16, 4); #endif @@ -108,6 +116,7 @@ FNNAME (INSN_NAME) CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx16, expected, "");) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, ""); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c index aef4173..26c5489 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c @@ -19,6 +19,10 @@ VECT_VAR_DECL(expected0,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0 }; VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected0,hmfloat,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcc00, 0xcc00, 0xcc00 }; @@ -50,6 +54,12 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0 }; VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected0,hmfloat,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xcc00, 0xcc00, 0xcc00, @@ -73,6 +83,10 @@ VECT_VAR_DECL(expected1,uint,64,1) [] = { 0xfffffffffffffff1 }; VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1 }; VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff1, 
0xfff1, 0xfff1 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected1,hmfloat,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1, + 0xf1, 0xf1, 0xf1, 0xf1 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0xcb80, 0xcb80, 0xcb80, 0xcb80 }; @@ -104,6 +118,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1 }; VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected1,hmfloat,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1, + 0xf1, 0xf1, 0xf1, 0xf1, + 0xf1, 0xf1, 0xf1, 0xf1, + 0xf1, 0xf1, 0xf1, 0xf1 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0xcb80, 0xcb80, 0xcb80, 0xcb80, @@ -127,6 +147,10 @@ VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff2 }; VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2 }; VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2, + 0xf2, 0xf2, 0xf2, 0xf2 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb00, 0xcb00, 0xcb00, 0xcb00 }; @@ -158,6 +182,12 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2 }; VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2, + 0xf2, 0xf2, 0xf2, 0xf2, + 0xf2, 0xf2, 0xf2, 0xf2, + 0xf2, 0xf2, 0xf2, 0xf2 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xcb00, 0xcb00, 0xcb00, 0xcb00, @@ -201,6 +231,7 @@ void exec_vdup_vmov (void) TEST_VDUP(, uint, u, 64, 1); TEST_VDUP(, poly, p, 8, 8); TEST_VDUP(, poly, p, 16, 4); + MFLOAT8_ONLY(TEST_VDUP(, mfloat, mf, 8, 8)); #if defined (FP16_SUPPORTED) TEST_VDUP(, float, f, 16, 4); #endif @@ -216,6 +247,7 @@ void exec_vdup_vmov (void) TEST_VDUP(q, uint, u, 64, 2); TEST_VDUP(q, poly, p, 8, 16); TEST_VDUP(q, poly, p, 16, 8); + MFLOAT8_ONLY(TEST_VDUP(q, mfloat, mf, 8, 16)); #if defined (FP16_SUPPORTED) TEST_VDUP(q, float, f, 16, 8); #endif @@ -268,6 +300,7 @@ void exec_vdup_vmov (void) TEST_VMOV(, uint, u, 64, 1); TEST_VMOV(, poly, p, 8, 8); TEST_VMOV(, poly, p, 16, 4); + MFLOAT8_ONLY(TEST_VMOV(, mfloat, mf, 8, 8)); #if defined (FP16_SUPPORTED) TEST_VMOV(, float, f, 16, 4); #endif @@ -283,6 +316,7 @@ void exec_vdup_vmov (void) TEST_VMOV(q, uint, u, 64, 2); TEST_VMOV(q, poly, p, 8, 16); TEST_VMOV(q, poly, p, 16, 8); + MFLOAT8_ONLY(TEST_VMOV(q, mfloat, mf, 8, 16)); #if defined (FP16_SUPPORTED) TEST_VMOV(q, float, f, 16, 8); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c index 5d0dba3..e0f6a86 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c @@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff3, 0xfff3, 0xfff3, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xf6, 0xf6, 0xf6, 0xf6, + 0xf6, 0xf6, 0xf6, 0xf6 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 }; #if defined (FP16_SUPPORTED) 
VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xca80, 0xca80, @@ -47,6 +51,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5 }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xf7, 0xf7, 0xf7, 0xf7, + 0xf7, 0xf7, 0xf7, 0xf7, + 0xf7, 0xf7, 0xf7, 0xf7, + 0xf7, 0xf7, 0xf7, 0xf7 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xca80, 0xca80, 0xca80, 0xca80, @@ -73,6 +83,7 @@ void exec_vdup_lane (void) clean_results (); TEST_MACRO_64BITS_VARIANTS_2_5(VLOAD, vector, buffer); + MFLOAT8_ONLY(VLOAD(vector, buffer, , mfloat, mf, 8, 8);) #if defined (FP16_SUPPORTED) VLOAD(vector, buffer, , float, f, 16, 4); #endif @@ -89,6 +100,7 @@ void exec_vdup_lane (void) TEST_VDUP_LANE(, uint, u, 64, 1, 1, 0); TEST_VDUP_LANE(, poly, p, 8, 8, 8, 7); TEST_VDUP_LANE(, poly, p, 16, 4, 4, 3); + MFLOAT8_ONLY(TEST_VDUP_LANE(, mfloat, mf, 8, 8, 8, 6);) #if defined (FP16_SUPPORTED) TEST_VDUP_LANE(, float, f, 16, 4, 4, 3); #endif @@ -104,6 +116,7 @@ void exec_vdup_lane (void) TEST_VDUP_LANE(q, uint, u, 64, 2, 1, 0); TEST_VDUP_LANE(q, poly, p, 8, 16, 8, 5); TEST_VDUP_LANE(q, poly, p, 16, 8, 4, 1); + MFLOAT8_ONLY(TEST_VDUP_LANE(q, mfloat, mf, 8, 16, 8, 7);) #if defined (FP16_SUPPORTED) TEST_VDUP_LANE(q, float, f, 16, 8, 4, 3); #endif @@ -134,6 +147,10 @@ VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7 }; VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff3, 0xfff3, 0xfff3, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,8) [] = { 0xfb, 0xfb, 0xfb, 0xfb, + 0xfb, 0xfb, 0xfb, 0xfb }; +#endif VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 }; #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xca80, 0xca80, @@ -165,6 +182,12 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5 }; VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,16) [] = { 0xfc, 0xfc, 0xfc, 0xfc, + 0xfc, 0xfc, 0xfc, 0xfc, + 0xfc, 0xfc, 0xfc, 0xfc, + 0xfc, 0xfc, 0xfc, 0xfc }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xc880, 0xc880, 0xc880, 0xc880, @@ -188,6 +211,7 @@ VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0xc1700000, clean_results (); TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vector, buffer); + MFLOAT8_ONLY(VLOAD(vector, buffer, q, mfloat, mf, 8, 16);) #if defined (FP16_SUPPORTED) VLOAD(vector, buffer, q, float, f, 16, 8); #endif @@ -204,6 +228,7 @@ VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0xc1700000, TEST_VDUP_LANEQ(, uint, u, 64, 1, 2, 0); TEST_VDUP_LANEQ(, poly, p, 8, 8, 16, 7); TEST_VDUP_LANEQ(, poly, p, 16, 4, 8, 3); + MFLOAT8_ONLY(TEST_VDUP_LANEQ(, mfloat, mf, 8, 8, 16, 11);) #if defined (FP16_SUPPORTED) TEST_VDUP_LANEQ(, float, f, 16, 4, 8, 3); #endif @@ -219,6 +244,7 @@ VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0xc1700000, TEST_VDUP_LANEQ(q, uint, u, 64, 2, 2, 0); TEST_VDUP_LANEQ(q, poly, p, 8, 16, 16, 5); TEST_VDUP_LANEQ(q, poly, p, 16, 8, 8, 1); + MFLOAT8_ONLY(TEST_VDUP_LANEQ(q, mfloat, mf, 8, 16, 16, 12);) #if defined (FP16_SUPPORTED) TEST_VDUP_LANEQ(q, float, f, 16, 8, 8, 7); #endif diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c index 908294a..f7da4ee 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c @@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf6, 0xf7, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff2, 0xfff3, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xf5, 0xf6, 0xf7, 0x77, + 0x77, 0x77, 0x77, 0x77 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcb00, 0xca80, 0x4b4d, 0x4b4d }; @@ -43,6 +47,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xfc, 0xfd, 0xfe, 0xff, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff6, 0xfff7, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xf9, 0xfa, 0xfb, 0xfc, + 0xfd, 0xfe, 0xff, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xc880, 0x4b4d, 0x4b4d, 0x4b4d, @@ -70,6 +80,10 @@ void exec_vext (void) clean_results (); TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector1, buffer); +#if MFLOAT8_SUPPORTED + VLOAD(vector1, buffer, , mfloat, mf, 8, 8); + VLOAD(vector1, buffer, q, mfloat, mf, 8, 16); +#endif #ifdef FP16_SUPPORTED VLOAD(vector1, buffer, , float, f, 16, 4); VLOAD(vector1, buffer, q, float, f, 16, 8); @@ -88,6 +102,7 @@ void exec_vext (void) VDUP(vector2, , uint, u, 64, 1, 0x88); VDUP(vector2, , poly, p, 8, 8, 0x55); VDUP(vector2, , poly, p, 16, 4, 0x66); + MFLOAT8_ONLY(VDUP(vector2, , mfloat, mf, 8, 8, MFLOAT8(0x77))); #if defined (FP16_SUPPORTED) VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. 
*/ #endif @@ -103,6 +118,7 @@ void exec_vext (void) VDUP(vector2, q, uint, u, 64, 2, 0x88); VDUP(vector2, q, poly, p, 8, 16, 0x55); VDUP(vector2, q, poly, p, 16, 8, 0x66); + MFLOAT8_ONLY(VDUP(vector2, q, mfloat, mf, 8, 16, MFLOAT8(0xaa))); #if defined (FP16_SUPPORTED) VDUP (vector2, q, float, f, 16, 8, 14.6f); #endif @@ -119,6 +135,7 @@ void exec_vext (void) TEST_VEXT(, uint, u, 64, 1, 0); TEST_VEXT(, poly, p, 8, 8, 6); TEST_VEXT(, poly, p, 16, 4, 2); + MFLOAT8_ONLY(TEST_VEXT(, mfloat, mf, 8, 8, 5)); #if defined (FP16_SUPPORTED) TEST_VEXT(, float, f, 16, 4, 2); #endif @@ -134,6 +151,7 @@ void exec_vext (void) TEST_VEXT(q, uint, u, 64, 2, 1); TEST_VEXT(q, poly, p, 8, 16, 12); TEST_VEXT(q, poly, p, 16, 8, 6); + MFLOAT8_ONLY(TEST_VEXT(q, mfloat, mf, 8, 16, 9)); #if defined (FP16_SUPPORTED) TEST_VEXT(q, float, f, 16, 8, 7); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c index f3b14ce..e2c9273 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c @@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff1 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff }; +#endif VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 }; VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 }; @@ -32,6 +36,7 @@ void exec_vget_high (void) DECL_VARIABLE_128BITS_VARIANTS(vector128); TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vector128, buffer); + MFLOAT8_ONLY(VLOAD(vector128, buffer, q, mfloat, mf, 8, 16);) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VLOAD(vector128, buffer, q, float, f, 16, 8); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c index 4ed0e46..7044b3c 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c @@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7 }; +#endif VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, @@ -45,6 +49,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xfc, 0xfd, 0xfe, 0xff }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff }; +#endif VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80, 0xca00, 0xc980, 0xc900, 0xc880 }; VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000, @@ -65,6 +75,10 @@ void exec_vld1 (void) TEST_MACRO_ALL_VARIANTS_2_5(TEST_VLD1, vector, 
buffer); +#if MFLOAT8_SUPPORTED + TEST_VLD1(vector, buffer, , mfloat, mf, 8, 8); + TEST_VLD1(vector, buffer, q, mfloat, mf, 8, 16); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) TEST_VLD1(vector, buffer, , float, f, 16, 4); TEST_VLD1(vector, buffer, q, float, f, 16, 8); diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c index 34be214..275cfee 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c @@ -17,6 +17,10 @@ VECT_VAR_DECL(expected0,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0 }; VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected0,hmfloat,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0 }; +#endif VECT_VAR_DECL(expected0,hfloat,16,4) [] = { 0xcc00, 0xcc00, 0xcc00, 0xcc00 }; VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1800000 }; VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0, @@ -45,6 +49,12 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0 }; VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected0,hmfloat,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0 }; +#endif VECT_VAR_DECL(expected0,hfloat,16,8) [] = { 0xcc00, 0xcc00, 0xcc00, 0xcc00, 0xcc00, 0xcc00, 0xcc00, 0xcc00 }; VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1800000, @@ -64,6 +74,10 @@ VECT_VAR_DECL(expected1,uint,64,1) [] = { 0xfffffffffffffff1 }; VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1 }; VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected1,hmfloat,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1, + 0xf1, 0xf1, 0xf1, 0xf1 }; +#endif VECT_VAR_DECL(expected1,hfloat,16,4) [] = { 0xcb80, 0xcb80, 0xcb80, 0xcb80 }; VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 }; VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1, @@ -92,6 +106,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1 }; VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected1,hmfloat,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1, + 0xf1, 0xf1, 0xf1, 0xf1, + 0xf1, 0xf1, 0xf1, 0xf1, + 0xf1, 0xf1, 0xf1, 0xf1 }; +#endif VECT_VAR_DECL(expected1,hfloat,16,8) [] = { 0xcb80, 0xcb80, 0xcb80, 0xcb80, 0xcb80, 0xcb80, 0xcb80, 0xcb80 }; VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1700000, 0xc1700000, @@ -111,6 +131,10 @@ VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff2 }; VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2 }; VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2, + 0xf2, 0xf2, 0xf2, 0xf2 }; +#endif VECT_VAR_DECL(expected2,hfloat,16,4) [] = { 0xcb00, 0xcb00, 0xcb00, 0xcb00 }; VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1600000, 0xc1600000 }; VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2, 
@@ -139,6 +163,12 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2 }; VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2, + 0xf2, 0xf2, 0xf2, 0xf2, + 0xf2, 0xf2, 0xf2, 0xf2, + 0xf2, 0xf2, 0xf2, 0xf2 }; +#endif VECT_VAR_DECL(expected2,hfloat,16,8) [] = { 0xcb00, 0xcb00, 0xcb00, 0xcb00, 0xcb00, 0xcb00, 0xcb00, 0xcb00 }; VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0xc1600000, @@ -163,6 +193,10 @@ void exec_vld1_dup (void) TEST_MACRO_ALL_VARIANTS_2_5(TEST_VLD1_DUP, vector, buffer_dup); +#if MFLOAT8_SUPPORTED + TEST_VLD1_DUP(vector, buffer_dup, , mfloat, mf, 8, 8); + TEST_VLD1_DUP(vector, buffer_dup, q, mfloat, mf, 8, 16); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) TEST_VLD1_DUP(vector, buffer_dup, , float, f, 16, 4); TEST_VLD1_DUP(vector, buffer_dup, q, float, f, 16, 8); diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c index 1f39006..d6f3ce7 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c @@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xf0 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xfff0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xf0, 0xaa, 0xaa }; +#endif VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xaaaa }; VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xaaaaaaaa, 0xc1800000 }; VECT_VAR_DECL(expected,int,8,16) [] = { 0xaa, 0xaa, 0xaa, 0xaa, @@ -44,6 +48,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xaa, 0xaa, 0xaa, 0xaa, 0xf0, 0xaa, 0xaa, 0xaa }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0xfff0, 0xaaaa }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xf0, + 0xaa, 0xaa, 0xaa, 0xaa }; +#endif VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0xcc00, 0xaaaa, 0xaaaa }; VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa, @@ -75,6 +85,7 @@ void exec_vld1_lane (void) ARRAY(buffer_src, uint, 64, 1); ARRAY(buffer_src, poly, 8, 8); ARRAY(buffer_src, poly, 16, 4); + MFLOAT8_ONLY(ARRAY(buffer_src, mfloat, 8, 8)); #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) ARRAY(buffer_src, float, 16, 4); #endif @@ -90,6 +101,7 @@ void exec_vld1_lane (void) ARRAY(buffer_src, uint, 64, 2); ARRAY(buffer_src, poly, 8, 16); ARRAY(buffer_src, poly, 16, 8); + MFLOAT8_ONLY(ARRAY(buffer_src, mfloat, 8, 16)); #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) ARRAY(buffer_src, float, 16, 8); #endif @@ -108,6 +120,7 @@ void exec_vld1_lane (void) TEST_VLD1_LANE(, uint, u, 64, 1, 0); TEST_VLD1_LANE(, poly, p, 8, 8, 7); TEST_VLD1_LANE(, poly, p, 16, 4, 3); + MFLOAT8_ONLY(TEST_VLD1_LANE(, mfloat, mf, 8, 8, 5)); #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) TEST_VLD1_LANE(, float, f, 16, 4, 2); #endif @@ -123,6 +136,7 @@ void exec_vld1_lane (void) TEST_VLD1_LANE(q, uint, u, 64, 2, 0); 
TEST_VLD1_LANE(q, poly, p, 8, 16, 12); TEST_VLD1_LANE(q, poly, p, 16, 8, 6); + MFLOAT8_ONLY(TEST_VLD1_LANE(q, mfloat, mf, 8, 16, 11)); #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) TEST_VLD1_LANE(q, float, f, 16, 8, 5); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x2.c index 0c45a2b..6e56ff1 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x2.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x2.c @@ -4,6 +4,7 @@ /* { dg-options "-O3" } */ #include <arm_neon.h> +#include "arm-neon-ref.h" extern void abort (void); @@ -16,14 +17,14 @@ test_vld##SUFFIX##_x2 () \ BASE##x##ELTS##x##2##_t vectors; \ int i,j; \ for (i = 0; i < ELTS * 2; i++) \ - data [i] = (BASE##_t) 2*i + 1; \ + data [i] = CONVERT (BASE##_t, 2*i + 1); \ asm volatile ("" : : : "memory"); \ vectors = vld1##SUFFIX##_x2 (data); \ vst1##SUFFIX (temp, vectors.val[0]); \ vst1##SUFFIX (&temp[ELTS], vectors.val[1]); \ asm volatile ("" : : : "memory"); \ for (j = 0; j < ELTS * 2; j++) \ - if (temp[j] != data[j]) \ + if (!BITEQUAL (temp[j], data[j])) \ return 1; \ return 0; \ } @@ -56,6 +57,8 @@ VARIANT (float32, 4, q_f32) #ifdef __aarch64__ #define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \ +VARIANT (mfloat8, 8, _mf8) \ +VARIANT (mfloat8, 16, q_mf8) \ VARIANT (float64, 1, _f64) \ VARIANT (float64, 2, q_f64) #else @@ -65,14 +68,14 @@ VARIANT (float64, 2, q_f64) /* Tests of vld1_x2 and vld1q_x2. */ VARIANTS (TESTMETH) -#define CHECK(BASE, ELTS, SUFFIX) \ +#define CHECKS(BASE, ELTS, SUFFIX) \ if (test_vld##SUFFIX##_x2 () != 0) \ abort (); int main (int argc, char **argv) { - VARIANTS (CHECK) + VARIANTS (CHECKS) return 0; } diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c index 4174dcd..42aeadf 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c @@ -17,7 +17,7 @@ test_vld##SUFFIX##_x3 () \ BASE##x##ELTS##x##3##_t vectors; \ int i,j; \ for (i = 0; i < ELTS * 3; i++) \ - data [i] = (BASE##_t) 3*i; \ + data [i] = CONVERT (BASE##_t, 3*i); \ asm volatile ("" : : : "memory"); \ vectors = vld1##SUFFIX##_x3 (data); \ vst1##SUFFIX (temp, vectors.val[0]); \ @@ -25,7 +25,7 @@ test_vld##SUFFIX##_x3 () \ vst1##SUFFIX (&temp[ELTS * 2], vectors.val[2]); \ asm volatile ("" : : : "memory"); \ for (j = 0; j < ELTS * 3; j++) \ - if (temp[j] != data[j]) \ + if (!BITEQUAL (temp[j], data[j])) \ return 1; \ return 0; \ } @@ -58,6 +58,8 @@ VARIANT (float32, 4, q_f32) #ifdef __aarch64__ #define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \ +VARIANT (mfloat8, 8, _mf8) \ +VARIANT (mfloat8, 16, q_mf8) \ VARIANT (float64, 1, _f64) \ VARIANT (float64, 2, q_f64) #else @@ -70,7 +72,7 @@ VARIANTS (TESTMETH) #define CHECKS(BASE, ELTS, SUFFIX) \ if (test_vld##SUFFIX##_x3 () != 0) \ - fprintf (stderr, "test_vld1##SUFFIX##_x3"); + fprintf (stderr, "test_vld1##SUFFIX##_x3"), abort (); int main (int argc, char **argv) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c index 17db262..694fda8 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c @@ -18,7 +18,7 @@ test_vld1##SUFFIX##_x4 () \ BASE##x##ELTS##x##4##_t vectors; \ int i,j; \ for (i = 0; i < ELTS * 4; i++) \ - data [i] = 
(BASE##_t) 4*i; \ + data [i] = CONVERT (BASE##_t, 4*i); \ asm volatile ("" : : : "memory"); \ vectors = vld1##SUFFIX##_x4 (data); \ vst1##SUFFIX (temp, vectors.val[0]); \ @@ -27,7 +27,7 @@ test_vld1##SUFFIX##_x4 () \ vst1##SUFFIX (&temp[ELTS * 3], vectors.val[3]); \ asm volatile ("" : : : "memory"); \ for (j = 0; j < ELTS * 4; j++) \ - if (temp[j] != data[j]) \ + if (!BITEQUAL (temp[j], data[j])) \ return 1; \ return 0; \ } @@ -62,6 +62,8 @@ VARIANT (float32, 4, q_f32) #ifdef __aarch64__ #define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \ +VARIANT (mfloat8, 8, _mf8) \ +VARIANT (mfloat8, 16, q_mf8) \ VARIANT (float64, 1, _f64) \ VARIANT (float64, 2, q_f64) #else diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c index 8a5fc22..81d7669 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c @@ -18,6 +18,10 @@ VECT_VAR_DECL(expected_vld2_0,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7 }; VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld2_0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7 }; +#endif VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_vld2_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, @@ -42,6 +46,12 @@ VECT_VAR_DECL(expected_vld2_0,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xfc, 0xfd, 0xfe, 0xff }; VECT_VAR_DECL(expected_vld2_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld2_0,hmfloat,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff }; +#endif VECT_VAR_DECL(expected_vld2_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80, 0xca00, 0xc980, 0xc900, 0xc880 }; VECT_VAR_DECL(expected_vld2_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000, @@ -61,6 +71,10 @@ VECT_VAR_DECL(expected_vld2_1,uint,64,1) [] = { 0xfffffffffffffff1 }; VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }; VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld2_1,hmfloat,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff }; +#endif VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 }; VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 }; VECT_VAR_DECL(expected_vld2_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3, @@ -85,6 +99,12 @@ VECT_VAR_DECL(expected_vld2_1,poly,8,16) [] = { 0x0, 0x1, 0x2, 0x3, 0xc, 0xd, 0xe, 0xf }; VECT_VAR_DECL(expected_vld2_1,poly,16,8) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb, 0xfffc, 0xfffd, 0xfffe, 0xffff }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld2_1,hmfloat,8,16) [] = { 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, + 0xc, 0xd, 0xe, 0xf }; +#endif VECT_VAR_DECL(expected_vld2_1,hfloat,16,8) [] = { 0xc800, 0xc700, 0xc600, 0xc500, 0xc400, 0xc200, 0xc000, 0xbc00 }; VECT_VAR_DECL(expected_vld2_1,hfloat,32,4) [] = { 0xc1400000, 0xc1300000, @@ -104,6 +124,10 @@ VECT_VAR_DECL(expected_vld3_0,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7 }; VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7 }; +#endif VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_vld3_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, @@ -128,6 +152,12 @@ VECT_VAR_DECL(expected_vld3_0,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xfc, 0xfd, 0xfe, 0xff }; VECT_VAR_DECL(expected_vld3_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_0,hmfloat,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff }; +#endif VECT_VAR_DECL(expected_vld3_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80, 0xca00, 0xc980, 0xc900, 0xc880 }; VECT_VAR_DECL(expected_vld3_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000, @@ -147,6 +177,10 @@ VECT_VAR_DECL(expected_vld3_1,uint,64,1) [] = { 0xfffffffffffffff1 }; VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }; VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_1,hmfloat,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff }; +#endif VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 }; VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 }; VECT_VAR_DECL(expected_vld3_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3, @@ -171,6 +205,12 @@ VECT_VAR_DECL(expected_vld3_1,poly,8,16) [] = { 0x0, 0x1, 0x2, 0x3, 0xc, 0xd, 0xe, 0xf }; VECT_VAR_DECL(expected_vld3_1,poly,16,8) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb, 0xfffc, 0xfffd, 0xfffe, 0xffff }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_1,hmfloat,8,16) [] = { 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, + 0xc, 0xd, 0xe, 0xf }; +#endif VECT_VAR_DECL(expected_vld3_1,hfloat,16,8) [] = { 0xc800, 0xc700, 0xc600, 0xc500, 0xc400, 0xc200, 0xc000, 0xbc00 }; VECT_VAR_DECL(expected_vld3_1,hfloat,32,4) [] = { 0xc1400000, 0xc1300000, @@ -193,6 +233,10 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_2,hmfloat,8,8) [] = { 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7 }; +#endif VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xc800, 0xc700, 0xc600, 0xc500 }; VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xc1400000, 0xc1300000 }; VECT_VAR_DECL(expected_vld3_2,int,8,16) [] = { 0x10, 0x11, 0x12, 0x13, @@ -217,6 +261,12 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,16) [] = { 0x10, 0x11, 0x12, 0x13, 0x1c, 0x1d, 0x1e, 0x1f }; VECT_VAR_DECL(expected_vld3_2,poly,16,8) [] = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_2,hmfloat,8,16) [] = { 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, + 0x1c, 0x1d, 0x1e, 0x1f }; +#endif VECT_VAR_DECL(expected_vld3_2,hfloat,16,8) [] = { 0x0000, 0x3c00, 0x4000, 0x4200, 0x4400, 0x4500, 0x4600, 0x4700 }; VECT_VAR_DECL(expected_vld3_2,hfloat,32,4) [] = { 0xc1000000, 0xc0e00000, @@ -237,6 +287,10 @@ VECT_VAR_DECL(expected_vld4_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; 
VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7 }; +#endif VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; @@ -262,6 +316,12 @@ VECT_VAR_DECL(expected_vld4_0,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xfc, 0xfd, 0xfe, 0xff }; VECT_VAR_DECL(expected_vld4_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_0,hmfloat,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff }; +#endif VECT_VAR_DECL(expected_vld4_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80, 0xca00, 0xc980, 0xc900, 0xc880 }; VECT_VAR_DECL(expected_vld4_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000, @@ -281,6 +341,10 @@ VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0xfffffffffffffff1 }; VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }; VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_1,hmfloat,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff }; +#endif VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 }; VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 }; VECT_VAR_DECL(expected_vld4_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3, @@ -305,6 +369,12 @@ VECT_VAR_DECL(expected_vld4_1,poly,8,16) [] = { 0x0, 0x1, 0x2, 0x3, 0xc, 0xd, 0xe, 0xf }; VECT_VAR_DECL(expected_vld4_1,poly,16,8) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb, 0xfffc, 0xfffd, 0xfffe, 0xffff }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_1,hmfloat,8,16) [] = { 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, + 0xc, 0xd, 0xe, 0xf }; +#endif VECT_VAR_DECL(expected_vld4_1,hfloat,16,8) [] = { 0xc800, 0xc700, 0xc600, 0xc500, 0xc400, 0xc200, 0xc000, 0xbc00 }; VECT_VAR_DECL(expected_vld4_1,hfloat,32,4) [] = { 0xc1400000, 0xc1300000, @@ -324,6 +394,10 @@ VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0xfffffffffffffff2 }; VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_2,hmfloat,8,8) [] = { 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7 }; +#endif VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xc800, 0xc700, 0xc600, 0xc500 }; VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xc1400000, 0xc1300000 }; VECT_VAR_DECL(expected_vld4_2,int,8,16) [] = { 0x10, 0x11, 0x12, 0x13, @@ -348,6 +422,12 @@ VECT_VAR_DECL(expected_vld4_2,poly,8,16) [] = { 0x10, 0x11, 0x12, 0x13, 0x1c, 0x1d, 0x1e, 0x1f }; VECT_VAR_DECL(expected_vld4_2,poly,16,8) [] = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_2,hmfloat,8,16) [] = { 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, + 0x1c, 0x1d, 0x1e, 0x1f }; +#endif VECT_VAR_DECL(expected_vld4_2,hfloat,16,8) [] = { 0x0000, 0x3c00, 0x4000, 0x4200, 0x4400, 0x4500, 0x4600, 0x4700 }; VECT_VAR_DECL(expected_vld4_2,hfloat,32,4) [] = { 
0xc1000000, 0xc0e00000, @@ -367,6 +447,10 @@ VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0xfffffffffffffff3 }; VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf }; VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfffc, 0xfffd, 0xfffe, 0xffff }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_3,hmfloat,8,8) [] = { 0x8, 0x9, 0xa, 0xb, + 0xc, 0xd, 0xe, 0xf }; +#endif VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xc400, 0xc200, 0xc000, 0xbc00 }; VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xc1200000, 0xc1100000 }; VECT_VAR_DECL(expected_vld4_3,int,8,16) [] = { 0x20, 0x21, 0x22, 0x23, @@ -391,6 +475,12 @@ VECT_VAR_DECL(expected_vld4_3,poly,8,16) [] = { 0x20, 0x21, 0x22, 0x23, 0x2c, 0x2d, 0x2e, 0x2f }; VECT_VAR_DECL(expected_vld4_3,poly,16,8) [] = { 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_3,hmfloat,8,16) [] = { 0x20, 0x21, 0x22, 0x23, + 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, + 0x2c, 0x2d, 0x2e, 0x2f }; +#endif VECT_VAR_DECL(expected_vld4_3,hfloat,16,8) [] = { 0x4800, 0x4880, 0x4900, 0x4980, 0x4a00, 0x4a80, 0x4b00, 0x4b80 }; VECT_VAR_DECL(expected_vld4_3,hfloat,32,4) [] = { 0xc0800000, 0xc0400000, @@ -436,6 +526,7 @@ void exec_vldX (void) DECL_VLDX(uint, 64, 1, X); \ DECL_VLDX(poly, 8, 8, X); \ DECL_VLDX(poly, 16, 4, X); \ + MFLOAT8_ONLY(DECL_VLDX(mfloat, 8, 8, X)); \ DECL_VLDX(float, 32, 2, X); \ DECL_VLDX(int, 8, 16, X); \ DECL_VLDX(int, 16, 8, X); \ @@ -445,6 +536,7 @@ void exec_vldX (void) DECL_VLDX(uint, 32, 4, X); \ DECL_VLDX(poly, 8, 16, X); \ DECL_VLDX(poly, 16, 8, X); \ + MFLOAT8_ONLY(DECL_VLDX(mfloat, 8, 16, X)); \ DECL_VLDX(float, 32, 4, X) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) @@ -467,6 +559,7 @@ void exec_vldX (void) TEST_VLDX(, uint, u, 64, 1, X); \ TEST_VLDX(, poly, p, 8, 8, X); \ TEST_VLDX(, poly, p, 16, 4, X); \ + MFLOAT8_ONLY(TEST_VLDX(, mfloat, mf, 8, 8, X)); \ TEST_VLDX(, float, f, 32, 2, X); \ TEST_VLDX(q, int, s, 8, 16, X); \ TEST_VLDX(q, int, s, 16, 8, X); \ @@ -476,6 +569,7 @@ void exec_vldX (void) TEST_VLDX(q, uint, u, 32, 4, X); \ TEST_VLDX(q, poly, p, 8, 16, X); \ TEST_VLDX(q, poly, p, 16, 8, X); \ + MFLOAT8_ONLY(TEST_VLDX(q, mfloat, mf, 8, 16, X)); \ TEST_VLDX(q, float, f, 32, 4, X) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) @@ -498,6 +592,7 @@ void exec_vldX (void) TEST_EXTRA_CHUNK(uint, 64, 1, X, Y); \ TEST_EXTRA_CHUNK(poly, 8, 8, X, Y); \ TEST_EXTRA_CHUNK(poly, 16, 4, X, Y); \ + MFLOAT8_ONLY(TEST_EXTRA_CHUNK(mfloat, 8, 8, X, Y)); \ TEST_EXTRA_CHUNK(float, 32, 2, X, Y); \ TEST_EXTRA_CHUNK(int, 8, 16, X, Y); \ TEST_EXTRA_CHUNK(int, 16, 8, X, Y); \ @@ -507,6 +602,7 @@ void exec_vldX (void) TEST_EXTRA_CHUNK(uint, 32, 4, X, Y); \ TEST_EXTRA_CHUNK(poly, 8, 16, X, Y); \ TEST_EXTRA_CHUNK(poly, 16, 8, X, Y); \ + MFLOAT8_ONLY(TEST_EXTRA_CHUNK(mfloat, 8, 16, X, Y)); \ TEST_EXTRA_CHUNK(float, 32, 4, X, Y) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) @@ -530,6 +626,7 @@ void exec_vldX (void) CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ + MFLOAT8_ONLY(CHECK_FP(test_name, mfloat, 8, 8, PRIx8, EXPECTED, comment)); \ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \ \ CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \ @@ -540,6 +637,7 @@ void exec_vldX (void) CHECK(test_name, uint, 32, 4, 
PRIx32, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \ + MFLOAT8_ONLY(CHECK_FP(test_name, mfloat, 8, 16, PRIx8, EXPECTED, comment)); \ CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) @@ -580,6 +678,12 @@ void exec_vldX (void) PAD(buffer_vld2_pad, poly, 8, 8); VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4); PAD(buffer_vld2_pad, poly, 16, 4); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld2, mfloat, 8, 8, 2); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld2, mfloat, 8, 8, 2), + VECT_ARRAY_VAR(buffer_vld2, int, 8, 8, 2), 8 * 2); + PAD(buffer_vld2_pad, mfloat, 8, 8); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT2(buffer_vld2, float, 16, 4); PAD(buffer_vld2_pad, float, 16, 4); @@ -607,6 +711,12 @@ void exec_vldX (void) PAD(buffer_vld2_pad, poly, 8, 16); VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8); PAD(buffer_vld2_pad, poly, 16, 8); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld2, mfloat, 8, 16, 2); + PAD(buffer_vld2_pad, mfloat, 8, 16); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld2, mfloat, 8, 16, 2), + VECT_ARRAY_VAR(buffer_vld2, int, 8, 16, 2), 16 * 2); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT2(buffer_vld2, float, 16, 8); PAD(buffer_vld2_pad, float, 16, 8); @@ -635,6 +745,12 @@ void exec_vldX (void) PAD(buffer_vld3_pad, poly, 8, 8); VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4); PAD(buffer_vld3_pad, poly, 16, 4); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld3, mfloat, 8, 8, 3); + PAD(buffer_vld3_pad, mfloat, 8, 8); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld3, mfloat, 8, 8, 3), + VECT_ARRAY_VAR(buffer_vld3, int, 8, 8, 3), 8 * 3); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT3(buffer_vld3, float, 16, 4); PAD(buffer_vld3_pad, float, 16, 4); @@ -662,6 +778,12 @@ void exec_vldX (void) PAD(buffer_vld3_pad, poly, 8, 16); VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8); PAD(buffer_vld3_pad, poly, 16, 8); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld3, mfloat, 8, 16, 3); + PAD(buffer_vld3_pad, mfloat, 8, 16); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld3, mfloat, 8, 16, 3), + VECT_ARRAY_VAR(buffer_vld3, int, 8, 16, 3), 16 * 3); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT3(buffer_vld3, float, 16, 8); PAD(buffer_vld3_pad, float, 16, 8); @@ -690,6 +812,12 @@ void exec_vldX (void) PAD(buffer_vld4_pad, poly, 8, 8); VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4); PAD(buffer_vld4_pad, poly, 16, 4); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld4, mfloat, 8, 8, 4); + PAD(buffer_vld4_pad, mfloat, 8, 8); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld4, mfloat, 8, 8, 4), + VECT_ARRAY_VAR(buffer_vld4, int, 8, 8, 4), 8 * 4); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT4(buffer_vld4, float, 16, 4); PAD(buffer_vld4_pad, float, 16, 4); @@ -717,6 +845,12 @@ void exec_vldX (void) PAD(buffer_vld4_pad, poly, 8, 16); VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8); PAD(buffer_vld4_pad, poly, 16, 8); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld4, mfloat, 8, 16, 4); + PAD(buffer_vld4_pad, mfloat, 8, 16); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld4, mfloat, 8, 16, 4), + VECT_ARRAY_VAR(buffer_vld4, int, 8, 16, 4), 16 * 4); +#endif #if defined 
(__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT4(buffer_vld4, float, 16, 8); PAD(buffer_vld4_pad, float, 16, 8); diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c index 903d306..76b720ee 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c @@ -18,6 +18,10 @@ VECT_VAR_DECL(expected_vld2_0,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1, 0xf0, 0xf1, 0xf0, 0xf1 }; VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld2_0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1, + 0xf0, 0xf1, 0xf0, 0xf1 }; +#endif VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = {0xcc00, 0xcb80, 0xcc00, 0xcb80 }; VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; @@ -36,6 +40,10 @@ VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1, 0xf0, 0xf1, 0xf0, 0xf1 }; VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld2_1,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1, + 0xf0, 0xf1, 0xf0, 0xf1 }; +#endif VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcc00, 0xcb80 }; VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; @@ -56,6 +64,10 @@ VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0, 0xf1, 0xf2, 0xf0, 0xf1 }; VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0, + 0xf1, 0xf2, 0xf0, 0xf1 }; +#endif VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xcc00 }; VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; @@ -76,6 +88,10 @@ VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2, 0xf0, 0xf1, 0xf2, 0xf0 }; VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xfff1, 0xfff2, 0xfff0, 0xfff1 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_1,hmfloat,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2, + 0xf0, 0xf1, 0xf2, 0xf0 }; +#endif VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xcb80, 0xcb00, 0xcc00, 0xcb80 }; VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xc1800000 }; @@ -96,6 +112,10 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1, 0xf2, 0xf0, 0xf1, 0xf2 }; VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xfff2, 0xfff0, 0xfff1, 0xfff2 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_2,hmfloat,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1, + 0xf2, 0xf0, 0xf1, 0xf2 }; +#endif VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xcc00, 0xcb80, 0xcb00 }; VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xc1700000, 0xc1600000 }; @@ -114,6 +134,10 @@ VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf0, 0xf1, 0xf2, 0xf3 }; VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf0, 0xf1, 0xf2, 0xf3 }; +#endif VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; 
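/* Illustrative sketch, not part of the patch: every mfloat8 expected-value
   array and every mfloat8 test line in these files is wrapped in
   "#if MFLOAT8_SUPPORTED" or "MFLOAT8_ONLY(...)", so the tests still build on
   configurations that lack the FP8 modal-float type.  The guards themselves
   come from the shared harness headers; the definitions below are only a
   plausible shape, shown to make the pattern concrete, and are an assumption
   rather than the harness's actual code.  */
#if defined (__ARM_FEATURE_FP8)          /* assumed feature test */
#define MFLOAT8_SUPPORTED 1
#define MFLOAT8_ONLY(X) X                /* keep the mfloat8 test line */
#else
#define MFLOAT8_SUPPORTED 0
#define MFLOAT8_ONLY(X)                  /* drop it entirely */
#endif

/* Usage then mirrors the hunks above: data and checks are both guarded.  */
#if MFLOAT8_SUPPORTED
static const unsigned char expected_mf8_bytes[8]
  = { 0xf0, 0xf1, 0xf0, 0xf1, 0xf0, 0xf1, 0xf0, 0xf1 };
#endif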
@@ -131,6 +155,10 @@ VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0xfffffffffffffff1 }; VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf0, 0xf1, 0xf2, 0xf3 }; VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_1,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf0, 0xf1, 0xf2, 0xf3 }; +#endif VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 }; @@ -148,6 +176,10 @@ VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0xfffffffffffffff2 }; VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf0, 0xf1, 0xf2, 0xf3 }; VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_2,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf0, 0xf1, 0xf2, 0xf3 }; +#endif VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; @@ -165,6 +197,10 @@ VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0xfffffffffffffff3 }; VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf0, 0xf1, 0xf2, 0xf3 }; VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_3,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf0, 0xf1, 0xf2, 0xf3 }; +#endif VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 }; @@ -208,6 +244,7 @@ void exec_vldX_dup (void) DECL_VLDX_DUP(uint, 64, 1, X); \ DECL_VLDX_DUP(poly, 8, 8, X); \ DECL_VLDX_DUP(poly, 16, 4, X); \ + MFLOAT8_ONLY(DECL_VLDX_DUP(mfloat, 8, 8, X)); \ DECL_VLDX_DUP(float, 32, 2, X) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) @@ -229,6 +266,7 @@ void exec_vldX_dup (void) TEST_VLDX_DUP(, uint, u, 64, 1, X); \ TEST_VLDX_DUP(, poly, p, 8, 8, X); \ TEST_VLDX_DUP(, poly, p, 16, 4, X); \ + MFLOAT8_ONLY(TEST_VLDX_DUP(, mfloat, mf, 8, 8, X)); \ TEST_VLDX_DUP(, float, f, 32, 2, X) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) @@ -250,6 +288,7 @@ void exec_vldX_dup (void) TEST_EXTRA_CHUNK(uint, 64, 1, X, Y); \ TEST_EXTRA_CHUNK(poly, 8, 8, X, Y); \ TEST_EXTRA_CHUNK(poly, 16, 4, X, Y); \ + MFLOAT8_ONLY(TEST_EXTRA_CHUNK(mfloat, 8, 8, X, Y)); \ TEST_EXTRA_CHUNK(float, 32, 2, X, Y) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) @@ -272,6 +311,7 @@ void exec_vldX_dup (void) CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ + MFLOAT8_ONLY(CHECK_FP(test_name, mfloat, 8, 8, PRIx8, EXPECTED, comment)); \ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) @@ -313,6 +353,12 @@ void exec_vldX_dup (void) PAD(buffer_vld2_pad, poly, 8, 8); VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4); PAD(buffer_vld2_pad, poly, 16, 4); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld2, mfloat, 8, 8, 2); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld2, mfloat, 8, 8, 2), + VECT_ARRAY_VAR(buffer_vld2, int, 8, 8, 2), 8 * 2); + PAD(buffer_vld2_pad, mfloat, 8, 8); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || 
defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT2(buffer_vld2, float, 16, 4); PAD(buffer_vld2_pad, float, 16, 4); @@ -340,6 +386,12 @@ void exec_vldX_dup (void) PAD(buffer_vld2_pad, poly, 8, 16); VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8); PAD(buffer_vld2_pad, poly, 16, 8); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld2, mfloat, 8, 16, 2); + PAD(buffer_vld2_pad, mfloat, 8, 16); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld2, mfloat, 8, 16, 2), + VECT_ARRAY_VAR(buffer_vld2, int, 8, 16, 2), 16 * 2); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT2(buffer_vld2, float, 16, 8); PAD(buffer_vld2_pad, float, 16, 8); @@ -368,6 +420,12 @@ void exec_vldX_dup (void) PAD(buffer_vld3_pad, poly, 8, 8); VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4); PAD(buffer_vld3_pad, poly, 16, 4); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld3, mfloat, 8, 8, 3); + PAD(buffer_vld3_pad, mfloat, 8, 8); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld3, mfloat, 8, 8, 3), + VECT_ARRAY_VAR(buffer_vld3, int, 8, 8, 3), 8 * 3); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT3(buffer_vld3, float, 16, 4); PAD(buffer_vld3_pad, float, 16, 4); @@ -395,6 +453,12 @@ void exec_vldX_dup (void) PAD(buffer_vld3_pad, poly, 8, 16); VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8); PAD(buffer_vld3_pad, poly, 16, 8); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld3, mfloat, 8, 16, 3); + PAD(buffer_vld3_pad, mfloat, 8, 16); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld3, mfloat, 8, 16, 3), + VECT_ARRAY_VAR(buffer_vld3, int, 8, 16, 3), 16 * 3); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT3(buffer_vld3, float, 16, 8); PAD(buffer_vld3_pad, float, 16, 8); @@ -423,6 +487,12 @@ void exec_vldX_dup (void) PAD(buffer_vld4_pad, poly, 8, 8); VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4); PAD(buffer_vld4_pad, poly, 16, 4); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld4, mfloat, 8, 8, 4); + PAD(buffer_vld4_pad, mfloat, 8, 8); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld4, mfloat, 8, 8, 4), + VECT_ARRAY_VAR(buffer_vld4, int, 8, 8, 4), 8 * 4); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT4(buffer_vld4, float, 16, 4); PAD(buffer_vld4_pad, float, 16, 4); @@ -450,6 +520,12 @@ void exec_vldX_dup (void) PAD(buffer_vld4_pad, poly, 8, 16); VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8); PAD(buffer_vld4_pad, poly, 16, 8); +#if MFLOAT8_SUPPORTED + VECT_ARRAY(buffer_vld4, mfloat, 8, 16, 4); + PAD(buffer_vld4_pad, mfloat, 8, 16); + __builtin_memcpy (VECT_ARRAY_VAR(buffer_vld4, mfloat, 8, 16, 4), + VECT_ARRAY_VAR(buffer_vld4, int, 8, 16, 4), 16 * 4); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_ARRAY_INIT4(buffer_vld4, float, 16, 8); PAD(buffer_vld4_pad, float, 16, 8); diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c index 9651b70..dfda634 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c @@ -18,6 +18,10 @@ VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa }; VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld2_0,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa }; +#endif 
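/* Illustrative sketch, not part of the patch: the mfloat8 hunks around here
   declare the vld2/vld3/vld4 source arrays without an initializer and then
   __builtin_memcpy the bytes of the matching int8 arrays into them, because
   mfloat8_t has no conversions from integer types and so cannot go through
   VECT_ARRAY_INIT*.  The standalone example below shows the same seeding
   pattern feeding a structure load, assuming a toolchain whose <arm_neon.h>
   provides mfloat8_t and vld2_mf8; the array and function names are invented
   for the example.  */
#include <arm_neon.h>
#include <string.h>

/* Two 8-lane vectors' worth of source data, as signed bytes (0xf0, 0xf1, ...).  */
static int8_t src_s8[8 * 2] = { -16, -15, -14, -13, -12, -11, -10, -9,
                                -8, -7, -6, -5, -4, -3, -2, -1 };

/* The mfloat8 copy is left uninitialized and filled by a bit copy.  */
static mfloat8_t src_mf8[8 * 2];

mfloat8x8x2_t
load_mf8_pair (void)
{
  /* Reinterpret the int8 bit patterns as mfloat8 data...  */
  memcpy (src_mf8, src_s8, sizeof src_mf8);
  /* ...then run the deinterleaving load under test.  */
  return vld2_mf8 (src_mf8);
}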
VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_vld2_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, @@ -47,6 +51,10 @@ VECT_VAR_DECL(expected_vld2_1,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa }; VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld2_1,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xf0, 0xf1 }; +#endif VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xaaaa, 0xaaaa }; VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa }; VECT_VAR_DECL(expected_vld2_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, @@ -76,6 +84,10 @@ VECT_VAR_DECL(expected_vld3_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa }; VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa }; VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_0,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa }; +#endif VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_vld3_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, @@ -105,6 +117,10 @@ VECT_VAR_DECL(expected_vld3_1,uint,32,2) [] = { 0xaaaaaaaa, 0xfffffff0 }; VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, 0xf0, 0xf1, 0xf2, 0xaa }; VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_1,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa }; +#endif VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xcb80 }; VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xaaaaaaaa }; VECT_VAR_DECL(expected_vld3_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, @@ -134,6 +150,10 @@ VECT_VAR_DECL(expected_vld3_2,uint,32,2) [] = { 0xfffffff1, 0xfffffff2 }; VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa }; VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xaaaa, 0xfff0, 0xfff1, 0xfff2 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld3_2,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xf0, 0xf1, 0xf2 }; +#endif VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xaaaa, 0xaaaa, 0xaaaa }; VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa }; VECT_VAR_DECL(expected_vld3_2,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1, @@ -164,6 +184,10 @@ VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa }; VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_0,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa }; +#endif VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_vld4_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; @@ -192,6 +216,10 @@ VECT_VAR_DECL(expected_vld4_1,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa }; 
VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa }; VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_1,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa }; +#endif VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 }; VECT_VAR_DECL(expected_vld4_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, @@ -221,6 +249,10 @@ VECT_VAR_DECL(expected_vld4_2,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xaa, 0xaa, 0xaa, 0xaa }; VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_2,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa }; +#endif VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa }; VECT_VAR_DECL(expected_vld4_2,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, @@ -250,6 +282,10 @@ VECT_VAR_DECL(expected_vld4_3,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 }; VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa }; VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vld4_3,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xf0, 0xf1, 0xf2, 0xf3 }; +#endif VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa }; VECT_VAR_DECL(expected_vld4_3,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, @@ -279,6 +315,9 @@ VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2); VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2); VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2); VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2); +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(buffer_vld2_lane, mfloat, 8, 2)[2]; +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2); #endif @@ -295,6 +334,9 @@ VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3); VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3); VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3); VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3); +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(buffer_vld3_lane, mfloat, 8, 3)[3]; +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3); #endif @@ -311,6 +353,9 @@ VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4); VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4); VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4); VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4); +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(buffer_vld4_lane, mfloat, 8, 4)[4]; +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4); #endif @@ -371,6 +416,7 @@ void exec_vldX_lane (void) DECL_VLDX_LANE(uint, 16, 8, X); \ DECL_VLDX_LANE(uint, 32, 4, X); \ DECL_VLDX_LANE(poly, 16, 8, X); \ + MFLOAT8_ONLY(DECL_VLDX_LANE(mfloat, 8, 8, X)); \ DECL_VLDX_LANE(float, 32, 2, X); \ DECL_VLDX_LANE(float, 32, 4, X) @@ -384,9 +430,9 @@ void exec_vldX_lane (void) #endif /* Add some padding to try to catch out of bound 
accesses. */ -#define ARRAY1(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[1]={42} +#define ARRAY1(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[1]={CONVERT(T##W##_t,42)} #define DUMMY_ARRAY(V, T, W, N, L) \ - VECT_VAR_DECL(V,T,W,N)[N*L]={0}; \ + VECT_VAR_DECL(V,T,W,N)[N*L]={}; \ ARRAY1(V##_pad,T,W,N) /* Use the same lanes regardless of the size of the array (X), for @@ -405,6 +451,7 @@ void exec_vldX_lane (void) TEST_VLDX_LANE(q, uint, u, 16, 8, X, 5); \ TEST_VLDX_LANE(q, uint, u, 32, 4, X, 0); \ TEST_VLDX_LANE(q, poly, p, 16, 8, X, 5); \ + MFLOAT8_ONLY(TEST_VLDX_LANE(, mfloat, mf, 8, 8, X, 7)); \ TEST_VLDX_LANE(, float, f, 32, 2, X, 0); \ TEST_VLDX_LANE(q, float, f, 32, 4, X, 2) @@ -431,6 +478,7 @@ void exec_vldX_lane (void) TEST_EXTRA_CHUNK(uint, 16, 8, X, Y); \ TEST_EXTRA_CHUNK(uint, 32, 4, X, Y); \ TEST_EXTRA_CHUNK(poly, 16, 8, X, Y); \ + MFLOAT8_ONLY(TEST_EXTRA_CHUNK(mfloat, 8, 8, X, Y)); \ TEST_EXTRA_CHUNK(float, 32, 2, X, Y); \ TEST_EXTRA_CHUNK(float, 32, 4, X, Y) @@ -453,6 +501,7 @@ void exec_vldX_lane (void) CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ + MFLOAT8_ONLY(CHECK_FP(test_name, mfloat, 8, 8, PRIx8, EXPECTED, comment)); \ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \ CHECK(test_name, int, 16, 8, PRIx16, EXPECTED, comment); \ CHECK(test_name, int, 32, 4, PRIx32, EXPECTED, comment); \ @@ -475,6 +524,15 @@ void exec_vldX_lane (void) } #endif +#if MFLOAT8_SUPPORTED + __builtin_memcpy (VECT_VAR(buffer_vld2_lane, mfloat, 8, 2), + VECT_VAR(buffer_vld2_lane, int, 8, 2), 2); + __builtin_memcpy (VECT_VAR(buffer_vld3_lane, mfloat, 8, 3), + VECT_VAR(buffer_vld3_lane, int, 8, 3), 3); + __builtin_memcpy (VECT_VAR(buffer_vld4_lane, mfloat, 8, 4), + VECT_VAR(buffer_vld4_lane, int, 8, 4), 4); +#endif + /* Declare the temporary buffers / variables. */ DECL_ALL_VLDX_LANE(2); DECL_ALL_VLDX_LANE(3); @@ -494,6 +552,9 @@ void exec_vldX_lane (void) DUMMY_ARRAY(buffer_src, uint, 16, 8, 4); DUMMY_ARRAY(buffer_src, uint, 32, 4, 4); DUMMY_ARRAY(buffer_src, poly, 16, 8, 4); +#if MFLOAT8_SUPPORTED + DUMMY_ARRAY(buffer_src, mfloat, 8, 8, 4); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) DUMMY_ARRAY(buffer_src, float, 16, 4, 4); DUMMY_ARRAY(buffer_src, float, 16, 8, 4); diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrev.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrev.c index 5215538..b1c57cf 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrev.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrev.c @@ -21,6 +21,10 @@ VECT_VAR_DECL(expected_vrev16,poly,8,16) [] = { 0xf1, 0xf0, 0xf3, 0xf2, 0xf5, 0xf4, 0xf7, 0xf6, 0xf9, 0xf8, 0xfb, 0xfa, 0xfd, 0xfc, 0xff, 0xfe }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vrev16,hmfloat,8,8) [] = { 0xf1, 0xf0, 0xf3, 0xf2, + 0xf5, 0xf4, 0xf7, 0xf6 }; +#endif /* Expected results for vrev32. 
*/ VECT_VAR_DECL(expected_vrev32,int,8,8) [] = { 0xf3, 0xf2, 0xf1, 0xf0, @@ -32,6 +36,10 @@ VECT_VAR_DECL(expected_vrev32,uint,16,4) [] = { 0xfff1, 0xfff0, 0xfff3, 0xfff2 } VECT_VAR_DECL(expected_vrev32,poly,8,8) [] = { 0xf3, 0xf2, 0xf1, 0xf0, 0xf7, 0xf6, 0xf5, 0xf4 }; VECT_VAR_DECL(expected_vrev32,poly,16,4) [] = { 0xfff1, 0xfff0, 0xfff3, 0xfff2 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vrev32,hmfloat,8,8) [] = { 0xf3, 0xf2, 0xf1, 0xf0, + 0xf7, 0xf6, 0xf5, 0xf4 }; +#endif VECT_VAR_DECL(expected_vrev32,int,8,16) [] = { 0xf3, 0xf2, 0xf1, 0xf0, 0xf7, 0xf6, 0xf5, 0xf4, 0xfb, 0xfa, 0xf9, 0xf8, @@ -50,6 +58,12 @@ VECT_VAR_DECL(expected_vrev32,poly,8,16) [] = { 0xf3, 0xf2, 0xf1, 0xf0, 0xff, 0xfe, 0xfd, 0xfc }; VECT_VAR_DECL(expected_vrev32,poly,16,8) [] = { 0xfff1, 0xfff0, 0xfff3, 0xfff2, 0xfff5, 0xfff4, 0xfff7, 0xfff6 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vrev32,hmfloat,8,16) [] = { 0xf3, 0xf2, 0xf1, 0xf0, + 0xf7, 0xf6, 0xf5, 0xf4, + 0xfb, 0xfa, 0xf9, 0xf8, + 0xff, 0xfe, 0xfd, 0xfc }; +#endif /* Expected results for vrev64. */ VECT_VAR_DECL(expected_vrev64,int,8,8) [] = { 0xf7, 0xf6, 0xf5, 0xf4, @@ -63,6 +77,10 @@ VECT_VAR_DECL(expected_vrev64,uint,32,2) [] = { 0xfffffff1, 0xfffffff0 }; VECT_VAR_DECL(expected_vrev64,poly,8,8) [] = { 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 }; VECT_VAR_DECL(expected_vrev64,poly,16,4) [] = { 0xfff3, 0xfff2, 0xfff1, 0xfff0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vrev64,hmfloat,8,8) [] = { 0xf7, 0xf6, 0xf5, 0xf4, + 0xf3, 0xf2, 0xf1, 0xf0 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected_vrev64, hfloat, 16, 4) [] = { 0xca80, 0xcb00, 0xcb80, 0xcc00 }; @@ -90,6 +108,12 @@ VECT_VAR_DECL(expected_vrev64,poly,8,16) [] = { 0xf7, 0xf6, 0xf5, 0xf4, 0xfb, 0xfa, 0xf9, 0xf8 }; VECT_VAR_DECL(expected_vrev64,poly,16,8) [] = { 0xfff3, 0xfff2, 0xfff1, 0xfff0, 0xfff7, 0xfff6, 0xfff5, 0xfff4 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vrev64,hmfloat,8,16) [] = { 0xf7, 0xf6, 0xf5, 0xf4, + 0xf3, 0xf2, 0xf1, 0xf0, + 0xff, 0xfe, 0xfd, 0xfc, + 0xfb, 0xfa, 0xf9, 0xf8 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected_vrev64, hfloat, 16, 8) [] = { 0xca80, 0xcb00, 0xcb80, 0xcc00, @@ -114,6 +138,10 @@ void exec_vrev (void) /* Initialize input "vector" from "buffer". 
*/ TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer); +#if MFLOAT8_SUPPORTED + VLOAD (vector, buffer, , mfloat, mf, 8, 8); + VLOAD (vector, buffer, q, mfloat, mf, 8, 16); +#endif #if defined (FP16_SUPPORTED) VLOAD (vector, buffer, , float, f, 16, 4); VLOAD (vector, buffer, q, float, f, 16, 8); @@ -129,6 +157,7 @@ void exec_vrev (void) TEST_VREV(q, int, s, 8, 16, 16); TEST_VREV(q, uint, u, 8, 16, 16); TEST_VREV(q, poly, p, 8, 16, 16); + MFLOAT8_ONLY(TEST_VREV(, mfloat, mf, 8, 8, 16)); CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vrev16, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vrev16, ""); @@ -136,6 +165,7 @@ void exec_vrev (void) CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev16, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev16, ""); CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev16, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vrev16, "")); #undef TEST_MSG #define TEST_MSG "VREV32" @@ -145,12 +175,14 @@ void exec_vrev (void) TEST_VREV(, uint, u, 16, 4, 32); TEST_VREV(, poly, p, 8, 8, 32); TEST_VREV(, poly, p, 16, 4, 32); + MFLOAT8_ONLY(TEST_VREV(, mfloat, mf, 8, 8, 32)); TEST_VREV(q, int, s, 8, 16, 32); TEST_VREV(q, int, s, 16, 8, 32); TEST_VREV(q, uint, u, 8, 16, 32); TEST_VREV(q, uint, u, 16, 8, 32); TEST_VREV(q, poly, p, 8, 16, 32); TEST_VREV(q, poly, p, 16, 8, 32); + MFLOAT8_ONLY(TEST_VREV(q, mfloat, mf, 8, 16, 32)); CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vrev32, ""); CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_vrev32, ""); @@ -158,12 +190,14 @@ void exec_vrev (void) CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_vrev32, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev32, ""); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev32, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vrev32, "")); CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev32, ""); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_vrev32, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev32, ""); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_vrev32, ""); CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev32, ""); CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev32, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 16, PRIx8, expected_vrev32, "")); #undef TEST_MSG #define TEST_MSG "VREV64" @@ -175,6 +209,7 @@ void exec_vrev (void) TEST_VREV(, uint, u, 32, 2, 64); TEST_VREV(, poly, p, 8, 8, 64); TEST_VREV(, poly, p, 16, 4, 64); + MFLOAT8_ONLY(TEST_VREV(, mfloat, mf, 8, 8, 64)); TEST_VREV(q, int, s, 8, 16, 64); TEST_VREV(q, int, s, 16, 8, 64); TEST_VREV(q, int, s, 32, 4, 64); @@ -183,6 +218,7 @@ void exec_vrev (void) TEST_VREV(q, uint, u, 32, 4, 64); TEST_VREV(q, poly, p, 8, 16, 64); TEST_VREV(q, poly, p, 16, 8, 64); + MFLOAT8_ONLY(TEST_VREV(q, mfloat, mf, 8, 16, 64)); CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vrev64, ""); CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_vrev64, ""); @@ -192,6 +228,7 @@ void exec_vrev (void) CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_vrev64, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev64, ""); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev64, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vrev64, "")); CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev64, ""); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_vrev64, ""); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_vrev64, ""); @@ -200,6 +237,7 @@ void exec_vrev (void) CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_vrev64, ""); CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev64, ""); 
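/* Illustrative sketch, not part of the patch: mfloat8 results are always
   validated by byte pattern.  The CHECK_FP lines here print the lanes with
   PRIx8 and compare them against the hmfloat expected arrays, and the
   vst1x2/vst1x3/vst1x4 tests later in the patch replace "temp[j] != data[j]"
   with "!BITEQUAL (temp[j], data[j])" (and plain casts with CONVERT) because
   mfloat8_t has neither == nor integer conversions.  The helpers below show
   one minimal way to express such bit-level checks; they are a sketch of the
   idea, assuming mfloat8_t from <arm_neon.h>, not the harness's real
   BITEQUAL/CHECK_FP macros.  */
#include <arm_neon.h>
#include <string.h>

/* Compare two mfloat8_t lanes by their object representation.  */
static inline int
mf8_biteq (mfloat8_t a, mfloat8_t b)
{
  return memcmp (&a, &b, sizeof a) == 0;
}

/* Check a whole 8-lane mfloat8 vector against an expected byte pattern.  */
static inline int
mf8x8_matches (mfloat8x8_t v, const unsigned char expected[8])
{
  unsigned char bytes[sizeof v];
  memcpy (bytes, &v, sizeof v);        /* copy out the raw lane bytes */
  return memcmp (bytes, expected, sizeof bytes) == 0;
}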
CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev64, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 16, PRIx8, expected_vrev64, "")); #if defined (FP16_SUPPORTED) TEST_VREV (, float, f, 16, 4, 64); diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vset_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vset_lane.c index e0499df..dc7d6ec 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vset_lane.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vset_lane.c @@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x88 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0x55, 0xf7 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff1, 0x66, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xbb, 0xf5, 0xf6, 0xf7 }; +#endif VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0x4840, 0xca80 }; VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x4204cccd }; VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, @@ -42,6 +46,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xfc, 0xfd, 0xdd, 0xff }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xee, 0xfff7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xa0, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff }; +#endif VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80, 0xca00, 0x4480, 0xc900, 0xc880 }; VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000, @@ -64,6 +74,10 @@ void exec_vset_lane (void) /* Initialize input "vector" from "buffer". */ TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer); +#if MFLOAT8_SUPPORTED + VLOAD (vector, buffer, , mfloat, mf, 8, 8); + VLOAD (vector, buffer, q, mfloat, mf, 8, 16); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VLOAD(vector, buffer, , float, f, 16, 4); VLOAD(vector, buffer, q, float, f, 16, 8); @@ -82,6 +96,7 @@ void exec_vset_lane (void) TEST_VSET_LANE(, uint, u, 64, 1, 0x88, 0); TEST_VSET_LANE(, poly, p, 8, 8, 0x55, 6); TEST_VSET_LANE(, poly, p, 16, 4, 0x66, 2); + MFLOAT8_ONLY(TEST_VSET_LANE(, mfloat, mf, 8, 8, MFLOAT8(0xbb), 4)); #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) TEST_VSET_LANE(, float, f, 16, 4, 8.5f, 2); #endif @@ -97,6 +112,7 @@ void exec_vset_lane (void) TEST_VSET_LANE(q, uint, u, 64, 2, 0x11, 1); TEST_VSET_LANE(q, poly, p, 8, 16, 0xDD, 14); TEST_VSET_LANE(q, poly, p, 16, 8, 0xEE, 6); + MFLOAT8_ONLY(TEST_VSET_LANE(q, mfloat, mf, 8, 16, MFLOAT8(0xa0), 10)); #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) TEST_VSET_LANE(q, float, f, 16, 8, 4.5f, 5); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshuffle.inc b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshuffle.inc index 9976488..42922b6 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshuffle.inc +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshuffle.inc @@ -41,6 +41,7 @@ void FNNAME (INSN_NAME) (void) DECL_VSHUFFLE(uint, 32, 2); \ DECL_VSHUFFLE(poly, 8, 8); \ DECL_VSHUFFLE(poly, 16, 4); \ + MFLOAT8_ONLY(DECL_VSHUFFLE(mfloat, 8, 8)); \ DECL_VSHUFFLE(float, 32, 2); \ DECL_VSHUFFLE(int, 8, 16); \ DECL_VSHUFFLE(int, 16, 8); \ @@ -50,6 +51,7 @@ void FNNAME (INSN_NAME) (void) DECL_VSHUFFLE(uint, 32, 4); \ DECL_VSHUFFLE(poly, 8, 16); \ 
DECL_VSHUFFLE(poly, 16, 8); \ + MFLOAT8_ONLY(DECL_VSHUFFLE(mfloat, 8, 16)); \ DECL_VSHUFFLE(float, 32, 4) DECL_ALL_VSHUFFLE(); @@ -60,6 +62,10 @@ void FNNAME (INSN_NAME) (void) /* Initialize input "vector" from "buffer". */ TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector1, buffer); +#if MFLOAT8_SUPPORTED + VLOAD (vector1, buffer, , mfloat, mf, 8, 8); + VLOAD (vector1, buffer, q, mfloat, mf, 8, 16); +#endif #if defined (FP16_SUPPORTED) VLOAD (vector1, buffer, , float, f, 16, 4); VLOAD (vector1, buffer, q, float, f, 16, 8); @@ -76,6 +82,7 @@ void FNNAME (INSN_NAME) (void) VDUP(vector2, , uint, u, 32, 2, 0x77); VDUP(vector2, , poly, p, 8, 8, 0x55); VDUP(vector2, , poly, p, 16, 4, 0x66); + MFLOAT8_ONLY(VDUP(vector2, , mfloat, mf, 8, 8, MFLOAT8(0xaa))); #if defined (FP16_SUPPORTED) VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */ #endif @@ -89,6 +96,7 @@ void FNNAME (INSN_NAME) (void) VDUP(vector2, q, uint, u, 32, 4, 0x77); VDUP(vector2, q, poly, p, 8, 16, 0x55); VDUP(vector2, q, poly, p, 16, 8, 0x66); + MFLOAT8_ONLY(VDUP(vector2, q, mfloat, mf, 8, 16, MFLOAT8(0xbc))); #if defined (FP16_SUPPORTED) VDUP (vector2, q, float, f, 16, 8, 14.6f); #endif @@ -103,6 +111,7 @@ void FNNAME (INSN_NAME) (void) TEST_VSHUFFLE(INSN, , uint, u, 32, 2); \ TEST_VSHUFFLE(INSN, , poly, p, 8, 8); \ TEST_VSHUFFLE(INSN, , poly, p, 16, 4); \ + MFLOAT8_ONLY(TEST_VSHUFFLE(INSN, , mfloat, mf, 8, 8)); \ TEST_VSHUFFLE(INSN, , float, f, 32, 2); \ TEST_VSHUFFLE(INSN, q, int, s, 8, 16); \ TEST_VSHUFFLE(INSN, q, int, s, 16, 8); \ @@ -112,6 +121,7 @@ void FNNAME (INSN_NAME) (void) TEST_VSHUFFLE(INSN, q, uint, u, 32, 4); \ TEST_VSHUFFLE(INSN, q, poly, p, 8, 16); \ TEST_VSHUFFLE(INSN, q, poly, p, 16, 8); \ + MFLOAT8_ONLY(TEST_VSHUFFLE(INSN, q, mfloat, mf, 8, 16)); \ TEST_VSHUFFLE(INSN, q, float, f, 32, 4) #define TEST_VSHUFFLE_FP16(INSN) \ @@ -127,6 +137,7 @@ void FNNAME (INSN_NAME) (void) TEST_EXTRA_CHUNK(uint, 32, 2, 1); \ TEST_EXTRA_CHUNK(poly, 8, 8, 1); \ TEST_EXTRA_CHUNK(poly, 16, 4, 1); \ + MFLOAT8_ONLY(TEST_EXTRA_CHUNK(mfloat, 8, 8, 1)); \ TEST_EXTRA_CHUNK(float, 32, 2, 1); \ TEST_EXTRA_CHUNK(int, 8, 16, 1); \ TEST_EXTRA_CHUNK(int, 16, 8, 1); \ @@ -136,6 +147,7 @@ void FNNAME (INSN_NAME) (void) TEST_EXTRA_CHUNK(uint, 32, 4, 1); \ TEST_EXTRA_CHUNK(poly, 8, 16, 1); \ TEST_EXTRA_CHUNK(poly, 16, 8, 1); \ + MFLOAT8_ONLY(TEST_EXTRA_CHUNK(mfloat, 8, 16, 1)); \ TEST_EXTRA_CHUNK(float, 32, 4, 1) /* vshuffle support all vector types except [u]int64x1 and @@ -150,6 +162,7 @@ void FNNAME (INSN_NAME) (void) CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ + MFLOAT8_ONLY(CHECK_FP(test_name, mfloat, 8, 8, PRIx8, EXPECTED, comment)); \ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \ \ CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \ @@ -160,6 +173,7 @@ void FNNAME (INSN_NAME) (void) CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \ + MFLOAT8_ONLY(CHECK_FP(test_name, mfloat, 8, 16, PRIx8, EXPECTED, comment)); \ CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment); \ } diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c index 825d07d..f26c467 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c +++ 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c @@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf6, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff2, 0x3333, 0x3333, 0x3333 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xf2, 0x33, 0x33, 0x33, + 0x33, 0x33, 0x33, 0x33 }; +#endif VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcb80, 0x3333, 0x3333, 0x3333 }; VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0x33333333 }; VECT_VAR_DECL(expected,int,8,16) [] = { 0xff, 0x33, 0x33, 0x33, @@ -43,6 +47,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xfa, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33 }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff4, 0x3333, 0x3333, 0x3333, 0x3333, 0x3333, 0x3333, 0x3333 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xfe, 0x33, 0x33, 0x33, + 0x33, 0x33, 0x33, 0x33, + 0x33, 0x33, 0x33, 0x33, + 0x33, 0x33, 0x33, 0x33 }; +#endif VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xc900, 0x3333, 0x3333, 0x3333, 0x3333, 0x3333, 0x3333, 0x3333 }; VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1700000, 0x33333333, @@ -72,6 +82,7 @@ void exec_vst1_lane (void) TEST_VST1_LANE(, uint, u, 64, 1, 0); TEST_VST1_LANE(, poly, p, 8, 8, 6); TEST_VST1_LANE(, poly, p, 16, 4, 2); + MFLOAT8_ONLY(TEST_VST1_LANE(, mfloat, mf, 8, 8, 2)); #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) TEST_VST1_LANE(, float, f, 16, 4, 1); #endif @@ -87,6 +98,7 @@ void exec_vst1_lane (void) TEST_VST1_LANE(q, uint, u, 64, 2, 0); TEST_VST1_LANE(q, poly, p, 8, 16, 10); TEST_VST1_LANE(q, poly, p, 16, 8, 4); + MFLOAT8_ONLY(TEST_VST1_LANE(q, mfloat, mf, 8, 16, 14)); #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) TEST_VST1_LANE(q, float, f, 16, 8, 6); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c index 6d20a46..69be40a 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c @@ -17,14 +17,14 @@ test_vst1##SUFFIX##_x2 () \ BASE##x##ELTS##x##2##_t vectors; \ int i,j; \ for (i = 0; i < ELTS * 2; i++) \ - data [i] = (BASE##_t) 2*i; \ + data [i] = CONVERT (BASE##_t, 2*i); \ asm volatile ("" : : : "memory"); \ vectors.val[0] = vld1##SUFFIX (data); \ vectors.val[1] = vld1##SUFFIX (&data[ELTS]); \ vst1##SUFFIX##_x2 (temp, vectors); \ asm volatile ("" : : : "memory"); \ for (j = 0; j < ELTS * 2; j++) \ - if (temp[j] != data[j]) \ + if (!BITEQUAL (temp[j], data[j])) \ return 1; \ return 0; \ } @@ -57,6 +57,8 @@ VARIANT (float32, 4, q_f32) #ifdef __aarch64__ #define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \ +VARIANT (mfloat8, 8, _mf8) \ +VARIANT (mfloat8, 16, q_mf8) \ VARIANT (float64, 1, _f64) \ VARIANT (float64, 2, q_f64) #else @@ -68,7 +70,7 @@ VARIANTS (TESTMETH) #define CHECKS(BASE, ELTS, SUFFIX) \ if (test_vst1##SUFFIX##_x2 () != 0) \ - fprintf (stderr, "test_vst1##SUFFIX##_x2"); + fprintf (stderr, "test_vst1##SUFFIX##_x2"), __builtin_abort (); int main (int argc, char **argv) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c index 87eae4d..4d42bcc 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c @@ -17,7 +17,7 @@ 
test_vst1##SUFFIX##_x3 () \ BASE##x##ELTS##x##3##_t vectors; \ int i,j; \ for (i = 0; i < ELTS * 3; i++) \ - data [i] = (BASE##_t) 3*i; \ + data [i] = CONVERT (BASE##_t, 3*i); \ asm volatile ("" : : : "memory"); \ vectors.val[0] = vld1##SUFFIX (data); \ vectors.val[1] = vld1##SUFFIX (&data[ELTS]); \ @@ -25,7 +25,7 @@ test_vst1##SUFFIX##_x3 () \ vst1##SUFFIX##_x3 (temp, vectors); \ asm volatile ("" : : : "memory"); \ for (j = 0; j < ELTS * 3; j++) \ - if (temp[j] != data[j]) \ + if (!BITEQUAL (temp[j], data[j])) \ return 1; \ return 0; \ } @@ -58,6 +58,8 @@ VARIANT (float32, 4, q_f32) #ifdef __aarch64__ #define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \ +VARIANT (mfloat8, 8, _mf8) \ +VARIANT (mfloat8, 16, q_mf8) \ VARIANT (float64, 1, _f64) \ VARIANT (float64, 2, q_f64) #else @@ -69,7 +71,7 @@ VARIANTS (TESTMETH) #define CHECKS(BASE, ELTS, SUFFIX) \ if (test_vst1##SUFFIX##_x3 () != 0) \ - fprintf (stderr, "test_vst1##SUFFIX##_x3"); + fprintf (stderr, "test_vst1##SUFFIX##_x3"), __builtin_abort (); int main (int argc, char **argv) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x4.c index 829a18d..ddc7fa5 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x4.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x4.c @@ -17,7 +17,7 @@ test_vst1##SUFFIX##_x4 () \ BASE##x##ELTS##x##4##_t vectors; \ int i,j; \ for (i = 0; i < ELTS * 4; i++) \ - data [i] = (BASE##_t) 4*i; \ + data [i] = CONVERT (BASE##_t, 4*i); \ asm volatile ("" : : : "memory"); \ vectors.val[0] = vld1##SUFFIX (data); \ vectors.val[1] = vld1##SUFFIX (&data[ELTS]); \ @@ -26,7 +26,7 @@ test_vst1##SUFFIX##_x4 () \ vst1##SUFFIX##_x4 (temp, vectors); \ asm volatile ("" : : : "memory"); \ for (j = 0; j < ELTS * 4; j++) \ - if (temp[j] != data[j]) \ + if (!BITEQUAL (temp[j], data[j])) \ return 1; \ return 0; \ } @@ -61,6 +61,8 @@ VARIANT (float32, 4, q_f32) #ifdef __aarch64__ #define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \ +VARIANT (mfloat8, 8, _mf8) \ +VARIANT (mfloat8, 16, q_mf8) \ VARIANT (float64, 1, _f64) \ VARIANT (float64, 2, q_f64) #else @@ -72,7 +74,7 @@ VARIANTS (TESTMETH) #define CHECKS(BASE, ELTS, SUFFIX) \ if (test_vst1##SUFFIX##_x4 () != 0) \ - fprintf (stderr, "test_vst1##SUFFIX##_x4"); + fprintf (stderr, "test_vst1##SUFFIX##_x4"), __builtin_abort (); int main (int argc, char **argv) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c index 45062d9..4ca5a4bd 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c @@ -14,6 +14,10 @@ VECT_VAR_DECL(expected_st2_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; VECT_VAR_DECL(expected_st2_0,poly,8,8) [] = { 0xf0, 0xf1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0x0, 0x0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_st2_0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_st2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_st2_0,int,16,8) [] = { 0xfff0, 0xfff1, 0x0, 0x0, @@ -42,6 +46,10 @@ VECT_VAR_DECL(expected_st2_1,uint,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 
}; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_st2_1,hmfloat,8,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_st2_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_1,hfloat,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, @@ -68,6 +76,10 @@ VECT_VAR_DECL(expected_st3_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; VECT_VAR_DECL(expected_st3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_st3_0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_st3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0x0 }; VECT_VAR_DECL(expected_st3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_st3_0,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0, @@ -97,6 +109,10 @@ VECT_VAR_DECL(expected_st3_1,uint,32,2) [] = { 0xfffffff2, 0x0 }; VECT_VAR_DECL(expected_st3_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_st3_1,hmfloat,8,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_st3_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_1,hfloat,32,2) [] = { 0xc1600000, 0x0 }; VECT_VAR_DECL(expected_st3_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, @@ -123,6 +139,10 @@ VECT_VAR_DECL(expected_st3_2,uint,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_2,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_2,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_st3_2,hmfloat,8,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_st3_2,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_2,hfloat,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_2,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, @@ -149,6 +169,10 @@ VECT_VAR_DECL(expected_st4_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; VECT_VAR_DECL(expected_st4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_st4_0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_st4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected_st4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_st4_0,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, @@ -178,6 +202,10 @@ VECT_VAR_DECL(expected_st4_1,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 }; VECT_VAR_DECL(expected_st4_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_st4_1,hmfloat,8,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_st4_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 }; VECT_VAR_DECL(expected_st4_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, @@ -204,6 +232,10 @@ VECT_VAR_DECL(expected_st4_2,uint,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_2,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_2,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +#if 
MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_st4_2,hmfloat,8,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_st4_2,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_2,hfloat,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_2,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, @@ -230,6 +262,10 @@ VECT_VAR_DECL(expected_st4_3,uint,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_3,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_3,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_st4_3,hmfloat,8,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_st4_3,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_3,hfloat,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_3,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, @@ -256,6 +292,9 @@ VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2); VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2); VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2); VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2); +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(buffer_vld2_lane, mfloat, 8, 2)[2]; +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2); #endif @@ -272,6 +311,9 @@ VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3); VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3); VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3); VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3); +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(buffer_vld3_lane, mfloat, 8, 3)[3]; +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3); #endif @@ -288,6 +330,9 @@ VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4); VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4); VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4); VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4); +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(buffer_vld4_lane, mfloat, 8, 4)[4]; +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4); #endif @@ -347,6 +392,7 @@ void exec_vstX_lane (void) DECL_VSTX_LANE(uint, 32, 2, X); \ DECL_VSTX_LANE(poly, 8, 8, X); \ DECL_VSTX_LANE(poly, 16, 4, X); \ + MFLOAT8_ONLY(DECL_VSTX_LANE(mfloat, 8, 8, X);) \ DECL_VSTX_LANE(float, 32, 2, X); \ DECL_VSTX_LANE(int, 16, 8, X); \ DECL_VSTX_LANE(int, 32, 4, X); \ @@ -378,6 +424,7 @@ void exec_vstX_lane (void) TEST_VSTX_LANE(, uint, u, 32, 2, X, 1); \ TEST_VSTX_LANE(, poly, p, 8, 8, X, 4); \ TEST_VSTX_LANE(, poly, p, 16, 4, X, 3); \ + MFLOAT8_ONLY(TEST_VSTX_LANE(, mfloat, mf, 8, 8, X, 5)); \ TEST_VSTX_LANE(q, int, s, 16, 8, X, 6); \ TEST_VSTX_LANE(q, int, s, 32, 4, X, 2); \ TEST_VSTX_LANE(q, uint, u, 16, 8, X, 5); \ @@ -403,6 +450,7 @@ void exec_vstX_lane (void) TEST_EXTRA_CHUNK(uint, 32, 2, X, Y); \ TEST_EXTRA_CHUNK(poly, 8, 8, X, Y); \ TEST_EXTRA_CHUNK(poly, 16, 4, X, Y); \ + MFLOAT8_ONLY(TEST_EXTRA_CHUNK(mfloat, 8, 8, X, Y)); \ TEST_EXTRA_CHUNK(float, 32, 2, X, Y); \ TEST_EXTRA_CHUNK(int, 16, 8, X, Y); \ TEST_EXTRA_CHUNK(int, 32, 4, X, Y); \ @@ -420,6 +468,15 @@ void exec_vstX_lane (void) #define TEST_ALL_EXTRA_CHUNKS(X,Y) TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y) #endif +#if MFLOAT8_SUPPORTED + __builtin_memcpy (VECT_VAR(buffer_vld2_lane, mfloat, 8, 2), + VECT_VAR(buffer_vld2_lane, int, 8, 2), 2); + __builtin_memcpy (VECT_VAR(buffer_vld3_lane, mfloat, 8, 3), + VECT_VAR(buffer_vld3_lane, 
int, 8, 3), 3); + __builtin_memcpy (VECT_VAR(buffer_vld4_lane, mfloat, 8, 4), + VECT_VAR(buffer_vld4_lane, int, 8, 4), 4); +#endif + /* Declare the temporary buffers / variables. */ DECL_ALL_VSTX_LANE(2); DECL_ALL_VSTX_LANE(3); @@ -434,6 +491,9 @@ void exec_vstX_lane (void) DUMMY_ARRAY(buffer_src, uint, 32, 2, 4); DUMMY_ARRAY(buffer_src, poly, 8, 8, 4); DUMMY_ARRAY(buffer_src, poly, 16, 4, 4); +#if MFLOAT8_SUPPORTED + DUMMY_ARRAY(buffer_src, mfloat, 8, 8, 4); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) DUMMY_ARRAY(buffer_src, float, 16, 4, 4); #endif @@ -462,6 +522,7 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st2_0, CMT); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_0, CMT); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_0, CMT); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_st2_0, CMT)); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st2_0, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st2_0, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st2_0, CMT); @@ -485,6 +546,7 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st2_1, CMT); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_1, CMT); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_1, CMT); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_st2_1, CMT)); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st2_1, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st2_1, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st2_1, CMT); @@ -514,6 +576,7 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_0, CMT); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_0, CMT); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_0, CMT); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_st3_0, CMT)); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_0, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_0, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_0, CMT); @@ -538,6 +601,7 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_1, CMT); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_1, CMT); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_1, CMT); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_st3_1, CMT)); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_1, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_1, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_1, CMT); @@ -562,6 +626,7 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_2, CMT); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_2, CMT); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_2, CMT); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_st3_2, CMT)); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_2, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_2, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_2, CMT); @@ -591,6 +656,7 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_0, CMT); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_0, CMT); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_0, CMT); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_st4_0, CMT)); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_0, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_0, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_0, CMT); @@ -615,6 +681,7 @@ 
void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_1, CMT); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_1, CMT); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_1, CMT); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_st4_1, CMT)); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_1, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_1, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_1, CMT); @@ -639,6 +706,7 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_2, CMT); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_2, CMT); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_2, CMT); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_st4_2, CMT)); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_2, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_2, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_2, CMT); @@ -663,6 +731,7 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_3, CMT); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_3, CMT); CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_3, CMT); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_st4_3, CMT)); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_3, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_3, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_3, CMT); diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtbX.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtbX.c index c3e1d9b..7d6e0a6 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtbX.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtbX.c @@ -9,6 +9,10 @@ VECT_VAR_DECL(expected_vtbl1,uint,8,8) [] = { 0x0, 0xf3, 0xf3, 0xf3, 0x0, 0x0, 0xf3, 0xf3 }; VECT_VAR_DECL(expected_vtbl1,poly,8,8) [] = { 0x0, 0xf3, 0xf3, 0xf3, 0x0, 0x0, 0xf3, 0xf3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vtbl1,hmfloat,8,8) [] = { 0x0, 0xf3, 0xf3, 0xf3, + 0x0, 0x0, 0xf3, 0xf3 }; +#endif /* Expected results for vtbl2. */ VECT_VAR_DECL(expected_vtbl2,int,8,8) [] = { 0xf6, 0xf3, 0xf3, 0xf3, @@ -17,6 +21,10 @@ VECT_VAR_DECL(expected_vtbl2,uint,8,8) [] = { 0xf6, 0xf5, 0xf5, 0xf5, 0x0, 0x0, 0xf5, 0xf5 }; VECT_VAR_DECL(expected_vtbl2,poly,8,8) [] = { 0xf6, 0xf5, 0xf5, 0xf5, 0x0, 0x0, 0xf5, 0xf5 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vtbl2,hmfloat,8,8) [] = { 0xf6, 0xf5, 0xf5, 0xf5, + 0x0, 0x0, 0xf5, 0xf5 }; +#endif /* Expected results for vtbl3. */ VECT_VAR_DECL(expected_vtbl3,int,8,8) [] = { 0xf8, 0xf4, 0xf4, 0xf4, @@ -25,6 +33,10 @@ VECT_VAR_DECL(expected_vtbl3,uint,8,8) [] = { 0xf8, 0xf7, 0xf7, 0xf7, 0xff, 0x0, 0xf7, 0xf7 }; VECT_VAR_DECL(expected_vtbl3,poly,8,8) [] = { 0xf8, 0xf7, 0xf7, 0xf7, 0xff, 0x0, 0xf7, 0xf7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vtbl3,hmfloat,8,8) [] = { 0xf8, 0xf7, 0xf7, 0xf7, + 0xff, 0x0, 0xf7, 0xf7 }; +#endif /* Expected results for vtbl4. */ VECT_VAR_DECL(expected_vtbl4,int,8,8) [] = { 0xfa, 0xf5, 0xf5, 0xf5, @@ -33,6 +45,10 @@ VECT_VAR_DECL(expected_vtbl4,uint,8,8) [] = { 0xfa, 0xf9, 0xf9, 0xf9, 0x3, 0x0, 0xf9, 0xf9 }; VECT_VAR_DECL(expected_vtbl4,poly,8,8) [] = { 0xfa, 0xf9, 0xf9, 0xf9, 0x3, 0x0, 0xf9, 0xf9 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vtbl4,hmfloat,8,8) [] = { 0xfa, 0xf9, 0xf9, 0xf9, + 0x3, 0x0, 0xf9, 0xf9 }; +#endif /* Expected results for vtbx1. 
*/ VECT_VAR_DECL(expected_vtbx1,int,8,8) [] = { 0x33, 0xf2, 0xf2, 0xf2, @@ -41,6 +57,10 @@ VECT_VAR_DECL(expected_vtbx1,uint,8,8) [] = { 0xcc, 0xf3, 0xf3, 0xf3, 0xcc, 0xcc, 0xf3, 0xf3 }; VECT_VAR_DECL(expected_vtbx1,poly,8,8) [] = { 0xcc, 0xf3, 0xf3, 0xf3, 0xcc, 0xcc, 0xf3, 0xf3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vtbx1,hmfloat,8,8) [] = { 0x55, 0xf3, 0xf3, 0xf3, + 0x55, 0x55, 0xf3, 0xf3 }; +#endif /* Expected results for vtbx2. */ VECT_VAR_DECL(expected_vtbx2,int,8,8) [] = { 0xf6, 0xf3, 0xf3, 0xf3, @@ -49,6 +69,10 @@ VECT_VAR_DECL(expected_vtbx2,uint,8,8) [] = { 0xf6, 0xf5, 0xf5, 0xf5, 0xcc, 0xcc, 0xf5, 0xf5 }; VECT_VAR_DECL(expected_vtbx2,poly,8,8) [] = { 0xf6, 0xf5, 0xf5, 0xf5, 0xcc, 0xcc, 0xf5, 0xf5 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vtbx2,hmfloat,8,8) [] = { 0xf6, 0xf5, 0xf5, 0xf5, + 0x55, 0x55, 0xf5, 0xf5 }; +#endif /* Expected results for vtbx3. */ VECT_VAR_DECL(expected_vtbx3,int,8,8) [] = { 0xf8, 0xf4, 0xf4, 0xf4, @@ -57,6 +81,10 @@ VECT_VAR_DECL(expected_vtbx3,uint,8,8) [] = { 0xf8, 0xf7, 0xf7, 0xf7, 0xff, 0xcc, 0xf7, 0xf7 }; VECT_VAR_DECL(expected_vtbx3,poly,8,8) [] = { 0xf8, 0xf7, 0xf7, 0xf7, 0xff, 0xcc, 0xf7, 0xf7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vtbx3,hmfloat,8,8) [] = { 0xf8, 0xf7, 0xf7, 0xf7, + 0xff, 0x55, 0xf7, 0xf7 }; +#endif /* Expected results for vtbx4. */ VECT_VAR_DECL(expected_vtbx4,int,8,8) [] = { 0xfa, 0xf5, 0xf5, 0xf5, @@ -65,6 +93,10 @@ VECT_VAR_DECL(expected_vtbx4,uint,8,8) [] = { 0xfa, 0xf9, 0xf9, 0xf9, 0x3, 0xcc, 0xf9, 0xf9 }; VECT_VAR_DECL(expected_vtbx4,poly,8,8) [] = { 0xfa, 0xf9, 0xf9, 0xf9, 0x3, 0xcc, 0xf9, 0xf9 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected_vtbx4,hmfloat,8,8) [] = { 0xfa, 0xf9, 0xf9, 0xf9, + 0x3, 0x55, 0xf9, 0xf9 }; +#endif void exec_vtbX (void) { @@ -105,32 +137,38 @@ void exec_vtbX (void) DECL_VARIABLE(vector_res, int, 8, 8); DECL_VARIABLE(vector_res, uint, 8, 8); DECL_VARIABLE(vector_res, poly, 8, 8); + MFLOAT8_ONLY(DECL_VARIABLE(vector_res, mfloat, 8, 8)); /* For vtbl1. */ DECL_VARIABLE(table_vector, int, 8, 8); DECL_VARIABLE(table_vector, uint, 8, 8); DECL_VARIABLE(table_vector, poly, 8, 8); + MFLOAT8_ONLY(DECL_VARIABLE(table_vector, mfloat, 8, 8)); /* For vtbx*. */ DECL_VARIABLE(default_vector, int, 8, 8); DECL_VARIABLE(default_vector, uint, 8, 8); DECL_VARIABLE(default_vector, poly, 8, 8); + MFLOAT8_ONLY(DECL_VARIABLE(default_vector, mfloat, 8, 8)); /* We need only 8 bits variants. */ #define DECL_ALL_VTBLX(X) \ DECL_VTBX(int, 8, 8, X); \ DECL_VTBX(uint, 8, 8, X); \ - DECL_VTBX(poly, 8, 8, X) + DECL_VTBX(poly, 8, 8, X); \ + MFLOAT8_ONLY(DECL_VTBX(mfloat, 8, 8, X)) #define TEST_ALL_VTBL1() \ TEST_VTBL1(int, s, int, 8, 8); \ TEST_VTBL1(uint, u, uint, 8, 8); \ - TEST_VTBL1(poly, p, uint, 8, 8) + TEST_VTBL1(poly, p, uint, 8, 8); \ + MFLOAT8_ONLY(TEST_VTBL1(mfloat, mf, uint, 8, 8)) #define TEST_ALL_VTBLX(X) \ TEST_VTBLX(int, s, int, 8, 8, X); \ TEST_VTBLX(uint, u, uint, 8, 8, X); \ - TEST_VTBLX(poly, p, uint, 8, 8, X) + TEST_VTBLX(poly, p, uint, 8, 8, X); \ + MFLOAT8_ONLY(TEST_VTBLX(mfloat, mf, uint, 8, 8, X)) /* Declare the temporary buffers / variables. */ DECL_ALL_VTBLX(2); @@ -168,6 +206,7 @@ void exec_vtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl1, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl1, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl1, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vtbl1, "")); /* Check vtbl2. 
*/ clean_results (); @@ -178,6 +217,7 @@ void exec_vtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl2, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl2, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl2, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vtbl2, "")); /* Check vtbl3. */ clean_results (); @@ -188,6 +228,7 @@ void exec_vtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl3, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl3, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl3, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vtbl3, "")); /* Check vtbl4. */ clean_results (); @@ -198,6 +239,7 @@ void exec_vtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl4, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl4, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl4, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vtbl4, "")); /* Now test VTBX. */ @@ -229,17 +271,20 @@ void exec_vtbX (void) #define TEST_ALL_VTBX1() \ TEST_VTBX1(int, s, int, 8, 8); \ TEST_VTBX1(uint, u, uint, 8, 8); \ - TEST_VTBX1(poly, p, uint, 8, 8) + TEST_VTBX1(poly, p, uint, 8, 8); \ + MFLOAT8_ONLY(TEST_VTBX1(mfloat, mf, uint, 8, 8)) #define TEST_ALL_VTBXX(X) \ TEST_VTBXX(int, s, int, 8, 8, X); \ TEST_VTBXX(uint, u, uint, 8, 8, X); \ - TEST_VTBXX(poly, p, uint, 8, 8, X) + TEST_VTBXX(poly, p, uint, 8, 8, X); \ + MFLOAT8_ONLY(TEST_VTBXX(mfloat, mf, uint, 8, 8, X)) /* Choose init value arbitrarily, will be used as default value. */ VDUP(default_vector, , int, s, 8, 8, 0x33); VDUP(default_vector, , uint, u, 8, 8, 0xCC); VDUP(default_vector, , poly, p, 8, 8, 0xCC); + MFLOAT8_ONLY(VDUP(default_vector, , mfloat, mf, 8, 8, MFLOAT8(0x55))); /* Check vtbx1. */ clean_results (); @@ -250,6 +295,7 @@ void exec_vtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx1, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx1, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx1, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vtbx1, "")); /* Check vtbx2. */ clean_results (); @@ -260,6 +306,7 @@ void exec_vtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx2, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx2, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx2, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vtbx2, "")); /* Check vtbx3. */ clean_results (); @@ -270,6 +317,7 @@ void exec_vtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx3, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx3, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx3, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vtbx3, "")); /* Check vtbx4. 
*/ clean_results (); @@ -280,6 +328,7 @@ void exec_vtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx4, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx4, ""); CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx4, ""); + MFLOAT8_ONLY(CHECK_FP(TEST_MSG, mfloat, 8, 8, PRIx8, expected_vtbx4, "")); } int main (void) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c index ea2d8d8..9e13bc1 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c @@ -15,6 +15,10 @@ VECT_VAR_DECL(expected0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf1, 0x55, 0x55, 0xf2, 0xf3, 0x55, 0x55 }; VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff1, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xaa, 0xaa, + 0xf2, 0xf3, 0xaa, 0xaa }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcb80, 0x4b4d, 0x4b4d }; @@ -40,6 +44,12 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf1, 0x55, 0x55, 0xf6, 0xf7, 0x55, 0x55 }; VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff1, 0x66, 0x66, 0xfff2, 0xfff3, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected0,hmfloat,8,16) [] = { 0xf0, 0xf1, 0xbc, 0xbc, + 0xf2, 0xf3, 0xbc, 0xbc, + 0xf4, 0xf5, 0xbc, 0xbc, + 0xf6, 0xf7, 0xbc, 0xbc }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, 0x4b4d, 0x4b4d, @@ -61,6 +71,10 @@ VECT_VAR_DECL(expected1,uint,32,2) [] = { 0x77, 0x77 }; VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf4, 0xf5, 0x55, 0x55, 0xf6, 0xf7, 0x55, 0x55 }; VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff2, 0xfff3, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected1,hmfloat,8,8) [] = { 0xf4, 0xf5, 0xaa, 0xaa, + 0xf6, 0xf7, 0xaa, 0xaa }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0xcb00, 0xca80, 0x4b4d, 0x4b4d }; @@ -86,6 +100,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf8, 0xf9, 0x55, 0x55, 0xfe, 0xff, 0x55, 0x55 }; VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff4, 0xfff5, 0x66, 0x66, 0xfff6, 0xfff7, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected1,hmfloat,8,16) [] = { 0xf8, 0xf9, 0xbc, 0xbc, + 0xfa, 0xfb, 0xbc, 0xbc, + 0xfc, 0xfd, 0xbc, 0xbc, + 0xfe, 0xff, 0xbc, 0xbc }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0xca00, 0xc980, 0x4b4d, 0x4b4d, diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn_half.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn_half.c index 25a0f19..6debfe5 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn_half.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn_half.c @@ -20,6 +20,10 @@ VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0x55, 0xf2, 0x55, 0xf4, 0x55, 0xf6, 0x55 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0x66, 0xfff2, 0x66 }; VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x42066666 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xf0, 0x29, 0xf2, 0x29, + 0xf4, 0x29, 0xf6, 0x29 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0x4b4d, 0xcb00, 0x4b4d }; @@ -50,6 +54,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0x55, 0xf2, 0x55, 0xfc, 0x55, 0xfe, 0x55 }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 
0x66, 0xfff2, 0x66, 0xfff4, 0x66, 0xfff6, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xf0, 0xea, 0xf2, 0xea, + 0xf4, 0xea, 0xf6, 0xea, + 0xf8, 0xea, 0xfa, 0xea, + 0xfc, 0xea, 0xfe, 0xea }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0x4b4d, 0xcb00, 0x4b4d, @@ -82,6 +92,10 @@ void exec_vtrn_half (void) CLEAN(expected, uint, 64, 1); TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer); +#if MFLOAT8_SUPPORTED + VLOAD(vector, buffer, , mfloat, mf, 8, 8); + VLOAD(vector, buffer, q, mfloat, mf, 8, 16); +#endif #if defined (FP16_SUPPORTED) VLOAD(vector, buffer, , float, f, 16, 4); VLOAD(vector, buffer, q, float, f, 16, 8); @@ -99,6 +113,7 @@ void exec_vtrn_half (void) VDUP(vector2, , uint, u, 32, 2, 0x77); VDUP(vector2, , poly, p, 8, 8, 0x55); VDUP(vector2, , poly, p, 16, 4, 0x66); + MFLOAT8_ONLY(VDUP(vector2, , mfloat, mf, 8, 8, MFLOAT8(0x29))); #if defined (FP16_SUPPORTED) VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */ #endif @@ -114,6 +129,7 @@ void exec_vtrn_half (void) VDUP(vector2, q, uint, u, 64, 2, 0x88); VDUP(vector2, q, poly, p, 8, 16, 0x55); VDUP(vector2, q, poly, p, 16, 8, 0x66); + MFLOAT8_ONLY(VDUP(vector2, q, mfloat, mf, 8, 16, MFLOAT8(0xea))); #if defined (FP16_SUPPORTED) VDUP (vector2, q, float, f, 16, 8, 14.6f); #endif @@ -128,6 +144,7 @@ void exec_vtrn_half (void) TEST_VTRN1(, uint, u, 32, 2); TEST_VTRN1(, poly, p, 8, 8); TEST_VTRN1(, poly, p, 16, 4); + MFLOAT8_ONLY(TEST_VTRN1(, mfloat, mf, 8, 8)); #if defined (FP16_SUPPORTED) TEST_VTRN1(, float, f, 16, 4); #endif @@ -143,6 +160,7 @@ void exec_vtrn_half (void) TEST_VTRN1(q, uint, u, 64, 2); TEST_VTRN1(q, poly, p, 8, 16); TEST_VTRN1(q, poly, p, 16, 8); + MFLOAT8_ONLY(TEST_VTRN1(q, mfloat, mf, 8, 16)); #if defined (FP16_SUPPORTED) TEST_VTRN1(q, float, f, 16, 8); #endif @@ -174,6 +192,10 @@ VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff1 }; VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf1, 0x55, 0xf3, 0x55, 0xf5, 0x55, 0xf7, 0x55 }; VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff1, 0x66, 0xfff3, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,8) [] = { 0xf1, 0x29, 0xf3, 0x29, + 0xf5, 0x29, 0xf7, 0x29 }; +#endif VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0x42066666 }; #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb80, 0x4b4d, @@ -205,6 +227,12 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf1, 0x55, 0xf3, 0x55, 0xfd, 0x55, 0xff, 0x55 }; VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff1, 0x66, 0xfff3, 0x66, 0xfff5, 0x66, 0xfff7, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,16) [] = { 0xf1, 0xea, 0xf3, 0xea, + 0xf5, 0xea, 0xf7, 0xea, + 0xf9, 0xea, 0xfb, 0xea, + 0xfd, 0xea, 0xff, 0xea }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xcb80, 0x4b4d, 0xca80, 0x4b4d, @@ -225,6 +253,7 @@ VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0x42073333, TEST_VTRN2(, uint, u, 32, 2); TEST_VTRN2(, poly, p, 8, 8); TEST_VTRN2(, poly, p, 16, 4); + MFLOAT8_ONLY(TEST_VTRN2(, mfloat, mf, 8, 8)); #if defined (FP16_SUPPORTED) TEST_VTRN2(, float, f, 16, 4); #endif @@ -240,6 +269,7 @@ VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0x42073333, TEST_VTRN2(q, uint, u, 64, 2); TEST_VTRN2(q, poly, p, 8, 16); TEST_VTRN2(q, poly, p, 16, 8); + MFLOAT8_ONLY(TEST_VTRN2(q, mfloat, mf, 8, 16)); #if defined (FP16_SUPPORTED) TEST_VTRN2(q, float, f, 16, 8); #endif diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c index 43b49ca..6b105ab 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c @@ -19,6 +19,10 @@ VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7 }; VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected0,hmfloat,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; @@ -52,6 +56,12 @@ VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected0,hmfloat,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, + 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80, @@ -73,6 +83,10 @@ VECT_VAR_DECL(expected1,uint,32,2) [] = { 0x77, 0x77 }; VECT_VAR_DECL(expected1,poly,8,8) [] = { 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected1,poly,16,4) [] = { 0x66, 0x66, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected1,hmfloat,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0x4b4d, 0x4b4d, 0x4b4d, 0x4b4d }; @@ -98,6 +112,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected1,poly,16,8) [] = { 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected1,hmfloat,8,16) [] = { 0xbc, 0xbc, 0xbc, 0xbc, + 0xbc, 0xbc, 0xbc, 0xbc, + 0xbc, 0xbc, 0xbc, 0xbc, + 0xbc, 0xbc, 0xbc, 0xbc }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0x4b4d, 0x4b4d, 0x4b4d, 0x4b4d, diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp_half.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp_half.c index 2e6b666..fe35e15 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp_half.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp_half.c @@ -19,6 +19,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf2, 0xf4, 0xf6, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff2, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xf0, 0xf2, 0xf4, 0xf6, + 0x7b, 0x7b, 0x7b, 0x7b }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x42066666 }; #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb00, @@ -49,6 +53,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf2, 0xf4, 0xf6, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff2, 0xfff4, 0xfff6, 0x66, 0x66, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xf0, 0xf2, 0xf4, 0xf6, + 0xf8, 0xfa, 0xfc, 0xfe, + 0x92, 0x92, 0x92, 0x92, + 0x92, 0x92, 0x92, 0x92 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb00, 0xca00, 0xc900, 0x4b4d, 0x4b4d, 0x4b4d, 0x4b4d }; @@ -79,6 +89,10 @@ void exec_vuzp_half (void) CLEAN(expected, uint, 64, 1); 
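Note: the new mfloat8 cases in these harness tests follow one pattern throughout — expected-value arrays and per-type checks are wrapped in `#if MFLOAT8_SUPPORTED` / `MFLOAT8_ONLY(...)` so the files still build on targets without FP8, and vector constants are produced with the `MFLOAT8()` helper because `mfloat8_t` has no literal form. A minimal sketch of what such a guard could look like is below; the real definitions are assumed to live in the shared advsimd-intrinsics headers, so treat this purely as an illustration of intent.

```c
/* Illustrative only: assumed shape of the guard used by these tests.
   The actual macros come from the shared advsimd-intrinsics headers.  */
#if MFLOAT8_SUPPORTED
/* Expand the guarded test code when the target has FP8 support.  */
#define MFLOAT8_ONLY(...) __VA_ARGS__
#else
/* Drop the mfloat8-specific declarations, VDUPs and CHECKs otherwise.  */
#define MFLOAT8_ONLY(...)
#endif
```

A variadic macro is the natural shape here because the guarded calls, such as `MFLOAT8_ONLY(VDUP(vector2, , mfloat, mf, 8, 8, MFLOAT8(0x29)))`, contain commas of their own.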
TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer); +#if MFLOAT8_SUPPORTED + VLOAD(vector, buffer, , mfloat, mf, 8, 8); + VLOAD(vector, buffer, q, mfloat, mf, 8, 16); +#endif #if defined (FP16_SUPPORTED) VLOAD(vector, buffer, , float, f, 16, 4); VLOAD(vector, buffer, q, float, f, 16, 8); @@ -96,6 +110,7 @@ void exec_vuzp_half (void) VDUP(vector2, , uint, u, 32, 2, 0x77); VDUP(vector2, , poly, p, 8, 8, 0x55); VDUP(vector2, , poly, p, 16, 4, 0x66); + MFLOAT8_ONLY(VDUP(vector2, , mfloat, mf, 8, 8, MFLOAT8(0x7b))); #if defined (FP16_SUPPORTED) VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */ #endif @@ -111,6 +126,7 @@ void exec_vuzp_half (void) VDUP(vector2, q, uint, u, 64, 2, 0x88); VDUP(vector2, q, poly, p, 8, 16, 0x55); VDUP(vector2, q, poly, p, 16, 8, 0x66); + MFLOAT8_ONLY(VDUP(vector2, q, mfloat, mf, 8, 16, MFLOAT8(0x92))); #if defined (FP16_SUPPORTED) VDUP (vector2, q, float, f, 16, 8, 14.6f); #endif @@ -125,6 +141,7 @@ void exec_vuzp_half (void) TEST_VUZP1(, uint, u, 32, 2); TEST_VUZP1(, poly, p, 8, 8); TEST_VUZP1(, poly, p, 16, 4); + MFLOAT8_ONLY(TEST_VUZP1(, mfloat, mf, 8, 8)); #if defined (FP16_SUPPORTED) TEST_VUZP1(, float, f, 16, 4); #endif @@ -140,6 +157,7 @@ void exec_vuzp_half (void) TEST_VUZP1(q, uint, u, 64, 2); TEST_VUZP1(q, poly, p, 8, 16); TEST_VUZP1(q, poly, p, 16, 8); + MFLOAT8_ONLY(TEST_VUZP1(q, mfloat, mf, 8, 16)); #if defined (FP16_SUPPORTED) TEST_VUZP1(q, float, f, 16, 8); #endif @@ -171,6 +189,10 @@ VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff1 }; VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf1, 0xf3, 0xf5, 0xf7, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff1, 0xfff3, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,8) [] = { 0xf1, 0xf3, 0xf5, 0xf7, + 0x7b, 0x7b, 0x7b, 0x7b }; +#endif VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0x42066666 }; #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb80, 0xca80, @@ -201,6 +223,12 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf1, 0xf3, 0xf5, 0xf7, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff1, 0xfff3, 0xfff5, 0xfff7, 0x66, 0x66, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,16) [] = { 0xf1, 0xf3, 0xf5, 0xf7, + 0xf9, 0xfb, 0xfd, 0xff, + 0x92, 0x92, 0x92, 0x92, + 0x92, 0x92, 0x92, 0x92 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xcb80, 0xca80, 0xc980, 0xc880, 0x4b4d, 0x4b4d, 0x4b4d, 0x4b4d @@ -221,6 +249,7 @@ VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0xc1500000, TEST_VUZP2(, uint, u, 32, 2); TEST_VUZP2(, poly, p, 8, 8); TEST_VUZP2(, poly, p, 16, 4); + MFLOAT8_ONLY(TEST_VUZP2(, mfloat, mf, 8, 8)); #if defined (FP16_SUPPORTED) TEST_VUZP2(, float, f, 16, 4); #endif @@ -236,6 +265,7 @@ VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0xc1500000, TEST_VUZP2(q, uint, u, 64, 2); TEST_VUZP2(q, poly, p, 8, 16); TEST_VUZP2(q, poly, p, 16, 8); + MFLOAT8_ONLY(TEST_VUZP2(q, mfloat, mf, 8, 16)); #if defined (FP16_SUPPORTED) TEST_VUZP2(q, float, f, 16, 8); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c index 20f4f5d..766da27 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c @@ -18,6 +18,10 @@ VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf4, 0x55, 0x55, 0xf1, 0xf5, 0x55, 0x55 }; VECT_VAR_DECL(expected0,poly,16,4) [] = { 
0xfff0, 0xfff2, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected0,hmfloat,8,8) [] = { 0xf0, 0xf4, 0xaa, 0xaa, + 0xf1, 0xf5, 0xaa, 0xaa }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcb00, 0x4b4d, 0x4b4d }; @@ -45,6 +49,12 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf8, 0x55, 0x55, 0xf3, 0xfb, 0x55, 0x55 }; VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff4, 0x66, 0x66, 0xfff1, 0xfff5, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected0,hmfloat,8,16) [] = { 0xf0, 0xf8, 0xbc, 0xbc, + 0xf1, 0xf9, 0xbc, 0xbc, + 0xf2, 0xfa, 0xbc, 0xbc, + 0xf3, 0xfb, 0xbc, 0xbc }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xca00, 0x4b4d, 0x4b4d, @@ -69,6 +79,10 @@ VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf2, 0xf6, 0x55, 0x55, 0xf3, 0xf7, 0x55, 0x55 }; VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff3, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected1,hmfloat,8,8) [] = { 0xf2, 0xf6, 0xaa, 0xaa, + 0xf3, 0xf7, 0xaa, 0xaa }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0xcb80, 0xca80, 0x4b4d, 0x4b4d }; @@ -96,6 +110,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf4, 0xfc, 0x55, 0x55, 0xf7, 0xff, 0x55, 0x55 }; VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff2, 0xfff6, 0x66, 0x66, 0xfff3, 0xfff7, 0x66, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected1,hmfloat,8,16) [] = { 0xf4, 0xfc, 0xbc, 0xbc, + 0xf5, 0xfd, 0xbc, 0xbc, + 0xf6, 0xfe, 0xbc, 0xbc, + 0xf7, 0xff, 0xbc, 0xbc }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0xcb00, 0xc900, 0x4b4d, 0x4b4d, diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip_half.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip_half.c index ef42451..5914192 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip_half.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip_half.c @@ -20,6 +20,10 @@ VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0x55, 0xf1, 0x55, 0xf2, 0x55, 0xf3, 0x55 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0x66, 0xfff1, 0x66 }; VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x42066666 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,8) [] = { 0xf0, 0xf9, 0xf1, 0xf9, + 0xf2, 0xf9, 0xf3, 0xf9 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0x4b4d, 0xcb80, 0x4b4d }; @@ -50,6 +54,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0x55, 0xf1, 0x55, 0xf6, 0x55, 0xf7, 0x55 }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0x66, 0xfff1, 0x66, 0xfff2, 0x66, 0xfff3, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected,hmfloat,8,16) [] = { 0xf0, 0xd6, 0xf1, 0xd6, + 0xf2, 0xd6, 0xf3, 0xd6, + 0xf4, 0xd6, 0xf5, 0xd6, + 0xf6, 0xd6, 0xf7, 0xd6 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0x4b4d, 0xcb80, 0x4b4d, @@ -82,6 +92,10 @@ void exec_vzip_half (void) CLEAN(expected, uint, 64, 1); TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer); +#if MFLOAT8_SUPPORTED + VLOAD(vector, buffer, , mfloat, mf, 8, 8); + VLOAD(vector, buffer, q, mfloat, mf, 8, 16); +#endif #if defined (FP16_SUPPORTED) VLOAD(vector, buffer, , float, f, 16, 4); VLOAD(vector, buffer, q, float, f, 16, 8); @@ -99,6 +113,7 @@ void exec_vzip_half (void) VDUP(vector2, , uint, u, 32, 2, 0x77); VDUP(vector2, , poly, p, 8, 8, 0x55); VDUP(vector2, , poly, p, 16, 4, 0x66); + MFLOAT8_ONLY(VDUP(vector2, , mfloat, 
mf, 8, 8, MFLOAT8(0xf9))); #if defined (FP16_SUPPORTED) VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */ #endif @@ -114,6 +129,7 @@ void exec_vzip_half (void) VDUP(vector2, q, uint, u, 64, 2, 0x88); VDUP(vector2, q, poly, p, 8, 16, 0x55); VDUP(vector2, q, poly, p, 16, 8, 0x66); + MFLOAT8_ONLY(VDUP(vector2, q, mfloat, mf, 8, 16, MFLOAT8(0xd6))); #if defined (FP16_SUPPORTED) VDUP (vector2, q, float, f, 16, 8, 14.6f); #endif @@ -128,6 +144,7 @@ void exec_vzip_half (void) TEST_VZIP1(, uint, u, 32, 2); TEST_VZIP1(, poly, p, 8, 8); TEST_VZIP1(, poly, p, 16, 4); + MFLOAT8_ONLY(TEST_VZIP1(, mfloat, mf, 8, 8)); #if defined (FP16_SUPPORTED) TEST_VZIP1(, float, f, 16, 4); #endif @@ -143,6 +160,7 @@ void exec_vzip_half (void) TEST_VZIP1(q, uint, u, 64, 2); TEST_VZIP1(q, poly, p, 8, 16); TEST_VZIP1(q, poly, p, 16, 8); + MFLOAT8_ONLY(TEST_VZIP1(q, mfloat, mf, 8, 16)); #if defined (FP16_SUPPORTED) TEST_VZIP1(q, float, f, 16, 8); #endif @@ -175,6 +193,10 @@ VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf4, 0x55, 0xf5, 0x55, 0xf6, 0x55, 0xf7, 0x55 }; VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0x66, 0xfff3, 0x66 }; VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0x42066666 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,8) [] = { 0xf4, 0xf9, 0xf5, 0xf9, + 0xf6, 0xf9, 0xf7, 0xf9 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb00, 0x4b4d, 0xca80, 0x4b4d }; @@ -205,6 +227,12 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf8, 0x55, 0xf9, 0x55, 0xfe, 0x55, 0xff, 0x55 }; VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff4, 0x66, 0xfff5, 0x66, 0xfff6, 0x66, 0xfff7, 0x66 }; +#if MFLOAT8_SUPPORTED +VECT_VAR_DECL(expected2,hmfloat,8,16) [] = { 0xf8, 0xd6, 0xf9, 0xd6, + 0xfa, 0xd6, 0xfb, 0xd6, + 0xfc, 0xd6, 0xfd, 0xd6, + 0xfe, 0xd6, 0xff, 0xd6 }; +#endif #if defined (FP16_SUPPORTED) VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xca00, 0x4b4d, 0xc980, 0x4b4d, @@ -225,6 +253,7 @@ VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0x42073333, TEST_VZIP2(, uint, u, 32, 2); TEST_VZIP2(, poly, p, 8, 8); TEST_VZIP2(, poly, p, 16, 4); + MFLOAT8_ONLY(TEST_VZIP2(, mfloat, mf, 8, 8)); #if defined (FP16_SUPPORTED) TEST_VZIP2(, float, f, 16, 4); #endif @@ -240,6 +269,7 @@ VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0x42073333, TEST_VZIP2(q, uint, u, 64, 2); TEST_VZIP2(q, poly, p, 8, 16); TEST_VZIP2(q, poly, p, 16, 8); + MFLOAT8_ONLY(TEST_VZIP2(q, mfloat, mf, 8, 16)); #if defined (FP16_SUPPORTED) TEST_VZIP2(q, float, f, 16, 8); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut.c b/gcc/testsuite/gcc.target/aarch64/simd/lut.c index fc89b21..51b7b9c 100644 --- a/gcc/testsuite/gcc.target/aarch64/simd/lut.c +++ b/gcc/testsuite/gcc.target/aarch64/simd/lut.c @@ -197,6 +197,70 @@ test_vluti2q_laneqp8(poly8x16_t a, uint8x16_t b, poly8x16_t results[]) } /* +** test_vluti2_lanemf8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti2_lanemf8(mfloat8x8_t a, uint8x8_t b, mfloat8x16_t results[]) +{ + results[0] = vluti2_lane_mf8(a, b, 0); + results[1] = vluti2_lane_mf8(a, b, 1); +} + +/* +** test_vluti2_laneqmf8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\] +** ... 
+** ret +*/ +void +test_vluti2_laneqmf8(mfloat8x8_t a, uint8x16_t b, mfloat8x16_t results[]) +{ + results[0] = vluti2_laneq_mf8(a, b, 0); + results[1] = vluti2_laneq_mf8(a, b, 1); + results[2] = vluti2_laneq_mf8(a, b, 2); + results[3] = vluti2_laneq_mf8(a, b, 3); +} + +/* +** test_vluti2q_lanemf8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti2q_lanemf8(mfloat8x16_t a, uint8x8_t b, mfloat8x16_t results[]) +{ + results[0] = vluti2q_lane_mf8(a, b, 0); + results[1] = vluti2q_lane_mf8(a, b, 1); +} + +/* +** test_vluti2q_laneqmf8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2q_laneqmf8(mfloat8x16_t a, uint8x16_t b, mfloat8x16_t results[]) +{ + results[0] = vluti2q_laneq_mf8(a, b, 0); + results[1] = vluti2q_laneq_mf8(a, b, 1); + results[2] = vluti2q_laneq_mf8(a, b, 2); + results[3] = vluti2q_laneq_mf8(a, b, 3); +} + +/* ** test_vluti2_laneu16: ** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] ** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] @@ -689,6 +753,32 @@ test_vluti4q_laneqp8(poly8x16_t a, uint8x16_t b, poly8x16_t results[]) } /* +** test_vluti4q_lanemf8: +** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** ... +** ret +*/ +void +test_vluti4q_lanemf8(mfloat8x16_t a, uint8x8_t b, mfloat8x16_t results[]) +{ + results[0] = vluti4q_lane_mf8(a, b, 0); +} + +/* +** test_vluti4q_laneqmf8: +** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti4q_laneqmf8(mfloat8x16_t a, uint8x16_t b, mfloat8x16_t results[]) +{ + results[0] = vluti4q_laneq_mf8(a, b, 0); + results[1] = vluti4q_laneq_mf8(a, b, 1); +} + +/* ** test_vluti4q_laneu16_x2: ** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] ** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] diff --git a/gcc/testsuite/gcc.target/aarch64/simd/mf8_data_1.c b/gcc/testsuite/gcc.target/aarch64/simd/mf8_data_1.c new file mode 100644 index 0000000..a3fd9b8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/mf8_data_1.c @@ -0,0 +1,1822 @@ +/* { dg-do assemble } */ +/* { dg-additional-options "-O -std=gnu23 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include <arm_neon.h> + +/* +** test_bsl1: +** bsl v0.8b, v1.8b, v2.8b +** ret +*/ +mfloat8x8_t test_bsl1(uint8x8_t a, mfloat8x8_t b, mfloat8x8_t c) +{ + return vbsl_mf8(a, b, c); +} + +/* +** test_bsl2: +** bif v0.8b, v2.8b, v1.8b +** ret +*/ +mfloat8x8_t test_bsl2(mfloat8x8_t a, uint8x8_t b, mfloat8x8_t c) +{ + return vbsl_mf8(b, a, c); +} + +/* +** test_bsl3: +** bit v0.8b, v2.8b, v1.8b +** ret +*/ +mfloat8x8_t test_bsl3(mfloat8x8_t a, uint8x8_t b, mfloat8x8_t c) +{ + return vbsl_mf8(b, c, a); +} + +/* +** test_bslq1: +** bsl v0.16b, v1.16b, v2.16b +** ret +*/ +mfloat8x16_t test_bslq1(uint8x16_t a, mfloat8x16_t b, mfloat8x16_t c) +{ + return vbslq_mf8(a, b, c); +} + +/* +** test_bslq2: +** bif v0.16b, v2.16b, v1.16b +** ret +*/ +mfloat8x16_t test_bslq2(mfloat8x16_t a, uint8x16_t b, mfloat8x16_t c) +{ + return vbslq_mf8(b, a, c); +} + +/* +** test_bslq3: +** bit v0.16b, v2.16b, v1.16b +** ret +*/ +mfloat8x16_t test_bslq3(mfloat8x16_t a, uint8x16_t b, mfloat8x16_t c) +{ + return vbslq_mf8(b, c, a); +} + +/* +** test_combine1: +** uzp1 v0.2d, v1.2d, v2.2d +** ret +*/ 
+mfloat8x16_t test_combine1(mfloat8_t a, mfloat8x8_t b, mfloat8x8_t c) +{ + return vcombine_mf8(b, c); +} + +/* +** test_copy_lane1: +** ins v0.b\[0\], v1.b\[0\] +** ret +*/ +mfloat8x8_t test_copy_lane1(mfloat8x8_t a, mfloat8x8_t b) +{ + return vcopy_lane_mf8(a, 0, b, 0); +} + +/* +** test_copy_lane2: +** ins v0.b\[0\], v1.b\[7\] +** ret +*/ +mfloat8x8_t test_copy_lane2(mfloat8x8_t a, mfloat8x8_t b) +{ + return vcopy_lane_mf8(a, 0, b, 7); +} + +/* +** test_copy_lane3: +** ins v0.b\[7\], v1.b\[0\] +** ret +*/ +mfloat8x8_t test_copy_lane3(mfloat8x8_t a, mfloat8x8_t b) +{ + return vcopy_lane_mf8(a, 7, b, 0); +} + +/* +** test_copy_lane4: +** ins v0.b\[5\], v1.b\[2\] +** ret +*/ +mfloat8x8_t test_copy_lane4(mfloat8x8_t a, mfloat8x8_t b) +{ + return vcopy_lane_mf8(a, 5, b, 2); +} + +/* +** test_copy_laneq1: +** ins v0.b\[0\], v1.b\[0\] +** ret +*/ +mfloat8x8_t test_copy_laneq1(mfloat8x8_t a, mfloat8x16_t b) +{ + return vcopy_laneq_mf8(a, 0, b, 0); +} + +/* +** test_copy_laneq2: +** ins v0.b\[0\], v1.b\[15\] +** ret +*/ +mfloat8x8_t test_copy_laneq2(mfloat8x8_t a, mfloat8x16_t b) +{ + return vcopy_laneq_mf8(a, 0, b, 15); +} + +/* +** test_copy_laneq3: +** ins v0.b\[7\], v1.b\[0\] +** ret +*/ +mfloat8x8_t test_copy_laneq3(mfloat8x8_t a, mfloat8x16_t b) +{ + return vcopy_laneq_mf8(a, 7, b, 0); +} + +/* +** test_copy_laneq4: +** ins v0.b\[6\], v1.b\[13\] +** ret +*/ +mfloat8x8_t test_copy_laneq4(mfloat8x8_t a, mfloat8x16_t b) +{ + return vcopy_laneq_mf8(a, 6, b, 13); +} + +/* +** test_copyq_lane1: +** ins v0.b\[0\], v1.b\[0\] +** ret +*/ +mfloat8x16_t test_copyq_lane1(mfloat8x16_t a, mfloat8x8_t b) +{ + return vcopyq_lane_mf8(a, 0, b, 0); +} + +/* +** test_copyq_lane2: +** ins v0.b\[0\], v1.b\[7\] +** ret +*/ +mfloat8x16_t test_copyq_lane2(mfloat8x16_t a, mfloat8x8_t b) +{ + return vcopyq_lane_mf8(a, 0, b, 7); +} + +/* +** test_copyq_lane3: +** ins v0.b\[15\], v1.b\[0\] +** ret +*/ +mfloat8x16_t test_copyq_lane3(mfloat8x16_t a, mfloat8x8_t b) +{ + return vcopyq_lane_mf8(a, 15, b, 0); +} + +/* +** test_copyq_lane4: +** ins v0.b\[11\], v1.b\[2\] +** ret +*/ +mfloat8x16_t test_copyq_lane4(mfloat8x16_t a, mfloat8x8_t b) +{ + return vcopyq_lane_mf8(a, 11, b, 2); +} + +/* +** test_copyq_laneq1: +** ins v0.b\[0\], v1.b\[0\] +** ret +*/ +mfloat8x16_t test_copyq_laneq1(mfloat8x16_t a, mfloat8x16_t b) +{ + return vcopyq_laneq_mf8(a, 0, b, 0); +} + +/* +** test_copyq_laneq2: +** ins v0.b\[0\], v1.b\[15\] +** ret +*/ +mfloat8x16_t test_copyq_laneq2(mfloat8x16_t a, mfloat8x16_t b) +{ + return vcopyq_laneq_mf8(a, 0, b, 15); +} + +/* +** test_copyq_laneq3: +** ins v0.b\[15\], v1.b\[0\] +** ret +*/ +mfloat8x16_t test_copyq_laneq3(mfloat8x16_t a, mfloat8x16_t b) +{ + return vcopyq_laneq_mf8(a, 15, b, 0); +} + +/* +** test_copyq_laneq4: +** ins v0.b\[9\], v1.b\[13\] +** ret +*/ +mfloat8x16_t test_copyq_laneq4(mfloat8x16_t a, mfloat8x16_t b) +{ + return vcopyq_laneq_mf8(a, 9, b, 13); +} + +/* +** test_create1: +** fmov d0, x0 +** ret +*/ +mfloat8x8_t test_create1(uint64_t a) +{ + return vcreate_mf8(a); +} + +/* +** test_create2: +** movi d0, #?0xffff +** ret +*/ +mfloat8x8_t test_create2() +{ + return vcreate_mf8(0xffff); +} + +/* +** test_dup1: +** dup v0.8b, v1.b\[0\] +** ret +*/ +mfloat8x8_t test_dup1(mfloat8_t a, mfloat8_t b) +{ + return vdup_n_mf8(b); +} + +/* +** test_dup2: +** movi v0.2s, #?0 +** ret +*/ +mfloat8x8_t test_dup2() +{ + return vdup_n_mf8(((union { uint8_t x; mfloat8_t y; }) { 0 }).y); +} + +/* +** test_dup3: +** movi v0.8b, #?0xf +** ret +*/ +mfloat8x8_t test_dup3() +{ + return vdup_n_mf8(((union 
{ uint8_t x; mfloat8_t y; }) { 0x0f }).y); +} + +/* +** test_dupq1: +** dup v0.16b, v1.b\[0\] +** ret +*/ +mfloat8x16_t test_dupq1(mfloat8_t a, mfloat8_t b) +{ + return vdupq_n_mf8(b); +} + +/* +** test_dupq2: +** movi v0.4s, #?0 +** ret +*/ +mfloat8x16_t test_dupq2() +{ + return vdupq_n_mf8(((union { uint8_t x; mfloat8_t y; }) { 0 }).y); +} + +/* +** test_dupq3: +** movi v0.16b, #?0xf +** ret +*/ +mfloat8x16_t test_dupq3() +{ + return vdupq_n_mf8(((union { uint8_t x; mfloat8_t y; }) { 0x0f }).y); +} + +/* +** test_dup_lane1: +** dup v0.8b, v1.b\[0\] +** ret +*/ +mfloat8x8_t test_dup_lane1(mfloat8_t a, mfloat8x8_t b) +{ + return vdup_lane_mf8(b, 0); +} + +/* +** test_dup_lane2: +** dup v0.8b, v1.b\[7\] +** ret +*/ +mfloat8x8_t test_dup_lane2(mfloat8_t a, mfloat8x8_t b) +{ + return vdup_lane_mf8(b, 7); +} + +/* +** test_dup_laneq1: +** dup v0.8b, v1.b\[0\] +** ret +*/ +mfloat8x8_t test_dup_laneq1(mfloat8_t a, mfloat8x16_t b) +{ + return vdup_laneq_mf8(b, 0); +} + +/* +** test_dup_laneq2: +** dup v0.8b, v1.b\[15\] +** ret +*/ +mfloat8x8_t test_dup_laneq2(mfloat8_t a, mfloat8x16_t b) +{ + return vdup_laneq_mf8(b, 15); +} + +/* +** test_dupq_lane1: +** dup v0.16b, v1.b\[0\] +** ret +*/ +mfloat8x16_t test_dupq_lane1(mfloat8_t a, mfloat8x8_t b) +{ + return vdupq_lane_mf8(b, 0); +} + +/* +** test_dupq_lane2: +** dup v0.16b, v1.b\[7\] +** ret +*/ +mfloat8x16_t test_dupq_lane2(mfloat8_t a, mfloat8x8_t b) +{ + return vdupq_lane_mf8(b, 7); +} + +/* +** test_dupq_laneq1: +** dup v0.16b, v1.b\[0\] +** ret +*/ +mfloat8x16_t test_dupq_laneq1(mfloat8_t a, mfloat8x16_t b) +{ + return vdupq_laneq_mf8(b, 0); +} + +/* +** test_dupq_laneq2: +** dup v0.16b, v1.b\[15\] +** ret +*/ +mfloat8x16_t test_dupq_laneq2(mfloat8_t a, mfloat8x16_t b) +{ + return vdupq_laneq_mf8(b, 15); +} + +/* +** test_dupb_lane1: +** dup b0, v1.b\[0\] +** ret +*/ +mfloat8_t test_dupb_lane1(mfloat8_t a, mfloat8x8_t b) +{ + return vdupb_lane_mf8(b, 0); +} + +/* +** test_dupb_lane2: +** dup b0, v1.b\[7\] +** ret +*/ +mfloat8_t test_dupb_lane2(mfloat8_t a, mfloat8x8_t b) +{ + return vdupb_lane_mf8(b, 7); +} + +/* +** test_dupb_laneq1: +** dup b0, v1.b\[0\] +** ret +*/ +mfloat8_t test_dupb_laneq1(mfloat8_t a, mfloat8x16_t b) +{ + return vdupb_laneq_mf8(b, 0); +} + +/* +** test_dupb_laneq2: +** dup b0, v1.b\[15\] +** ret +*/ +mfloat8_t test_dupb_laneq2(mfloat8_t a, mfloat8x16_t b) +{ + return vdupb_laneq_mf8(b, 15); +} + +/* +** test_ext1: +** ext v0.8b, v0.8b, v1.8b, #1 +** ret +*/ +mfloat8x8_t test_ext1(mfloat8x8_t a, mfloat8x8_t b) +{ + return vext_mf8(a, b, 1); +} + +/* +** test_ext2: +** ext v0.8b, v1.8b, v2.8b, #7 +** ret +*/ +mfloat8x8_t test_ext2(mfloat8x8_t a, mfloat8x8_t b, mfloat8x8_t c) +{ + return vext_mf8(b, c, 7); +} + +/* +** test_extq1: +** ext v0.16b, v0.16b, v1.16b, #1 +** ret +*/ +mfloat8x16_t test_extq1(mfloat8x16_t a, mfloat8x16_t b) +{ + return vextq_mf8(a, b, 1); +} + +/* +** test_extq2: +** ext v0.16b, v1.16b, v2.16b, #15 +** ret +*/ +mfloat8x16_t test_extq2(mfloat8x16_t a, mfloat8x16_t b, mfloat8x16_t c) +{ + return vextq_mf8(b, c, 15); +} + +/* +** test_ld1: { target { le && lp64 } } +** ldr d0, \[x0\] +** ret +*/ +/* +** test_ld1: { target { be && lp64 } } +** ld1 {v0.8b}, \[x0\] +** ret +*/ +mfloat8x8_t test_ld1(const mfloat8_t *ptr) +{ + return vld1_mf8(ptr); +} + +/* +** test_ld1q: { target { le && lp64 } } +** ldr q0, \[x0\] +** ret +*/ +/* +** test_ld1q: { target { be && lp64 } } +** ld1 {v0.16b}, \[x0\] +** ret +*/ +mfloat8x16_t test_ld1q(const mfloat8_t *ptr) +{ + return vld1q_mf8(ptr); +} + +/* +** 
test_ld1_dup: { target lp64 } +** ld1r {v0.8b}, \[x0\] +** ret +*/ +mfloat8x8_t test_ld1_dup(const mfloat8_t *ptr) +{ + return vld1_dup_mf8(ptr); +} + +/* +** test_ld1q_dup: { target lp64 } +** ld1r {v0.16b}, \[x0\] +** ret +*/ +mfloat8x16_t test_ld1q_dup(const mfloat8_t *ptr) +{ + return vld1q_dup_mf8(ptr); +} + +/* +** test_ld1_lane1: { target lp64 } +** ld1 {v0.b}\[0\], \[x0\] +** ret +*/ +mfloat8x8_t test_ld1_lane1(const mfloat8_t *ptr, mfloat8x8_t a) +{ + return vld1_lane_mf8(ptr, a, 0); +} + +/* +** test_ld1_lane2: { target lp64 } +** ld1 {v0.b}\[7\], \[x0\] +** ret +*/ +mfloat8x8_t test_ld1_lane2(const mfloat8_t *ptr, mfloat8x8_t a) +{ + return vld1_lane_mf8(ptr, a, 7); +} + +/* +** test_ld1q_lane1: { target lp64 } +** ld1 {v0.b}\[0\], \[x0\] +** ret +*/ +mfloat8x16_t test_ld1q_lane1(const mfloat8_t *ptr, mfloat8x16_t a) +{ + return vld1q_lane_mf8(ptr, a, 0); +} + +/* +** test_ld1q_lane2: { target lp64 } +** ld1 {v0.b}\[15\], \[x0\] +** ret +*/ +mfloat8x16_t test_ld1q_lane2(const mfloat8_t *ptr, mfloat8x16_t a) +{ + return vld1q_lane_mf8(ptr, a, 15); +} + +/* +** test_ld1_x2: { target lp64 } +** ld1 {v0.8b( - |, )v1.8b}, \[x0\] +** ret +*/ +mfloat8x8x2_t test_ld1_x2(const mfloat8_t *ptr) +{ + return vld1_mf8_x2(ptr); +} + +/* +** test_ld1q_x2: { target lp64 } +** ld1 {v0.16b( - |, )v1.16b}, \[x0\] +** ret +*/ +mfloat8x16x2_t test_ld1q_x2(const mfloat8_t *ptr) +{ + return vld1q_mf8_x2(ptr); +} + +/* +** test_ld1_x3: { target lp64 } +** ld1 {v0.8b - v2.8b}, \[x0\] +** ret +*/ +mfloat8x8x3_t test_ld1_x3(const mfloat8_t *ptr) +{ + return vld1_mf8_x3(ptr); +} + +/* +** test_ld1q_x3: { target lp64 } +** ld1 {v0.16b - v2.16b}, \[x0\] +** ret +*/ +mfloat8x16x3_t test_ld1q_x3(const mfloat8_t *ptr) +{ + return vld1q_mf8_x3(ptr); +} + +/* +** test_ld1_x4: { target lp64 } +** ld1 {v0.8b - v3.8b}, \[x0\] +** ret +*/ +mfloat8x8x4_t test_ld1_x4(const mfloat8_t *ptr) +{ + return vld1_mf8_x4(ptr); +} + +/* +** test_ld1q_x4: { target lp64 } +** ld1 {v0.16b - v3.16b}, \[x0\] +** ret +*/ +mfloat8x16x4_t test_ld1q_x4(const mfloat8_t *ptr) +{ + return vld1q_mf8_x4(ptr); +} + +/* +** test_ld2: { target lp64 } +** ld2 {v0.8b( - |, )v1.8b}, \[x0\] +** ret +*/ +mfloat8x8x2_t test_ld2(const mfloat8_t *ptr) +{ + return vld2_mf8(ptr); +} + +/* +** test_ld2q: { target lp64 } +** ld2 {v0.16b( - |, )v1.16b}, \[x0\] +** ret +*/ +mfloat8x16x2_t test_ld2q(const mfloat8_t *ptr) +{ + return vld2q_mf8(ptr); +} + +/* +** test_ld2_dup: { target lp64 } +** ld2r {v0.8b( - |, )v1.8b}, \[x0\] +** ret +*/ +mfloat8x8x2_t test_ld2_dup(const mfloat8_t *ptr) +{ + return vld2_dup_mf8(ptr); +} + +/* +** test_ld2q_dup: { target lp64 } +** ld2r {v0.16b( - |, )v1.16b}, \[x0\] +** ret +*/ +mfloat8x16x2_t test_ld2q_dup(const mfloat8_t *ptr) +{ + return vld2q_dup_mf8(ptr); +} + +/* +** test_ld2_lane1: { target lp64 } +** ld2 {v0.b( - |, )v1.b}\[0\], \[x0\] +** ret +*/ +mfloat8x8x2_t test_ld2_lane1(const mfloat8_t *ptr, mfloat8x8x2_t a) +{ + return vld2_lane_mf8(ptr, a, 0); +} + +/* +** test_ld2_lane2: { target lp64 } +** ld2 {v0.b( - |, )v1.b}\[7\], \[x0\] +** ret +*/ +mfloat8x8x2_t test_ld2_lane2(const mfloat8_t *ptr, mfloat8x8x2_t a) +{ + return vld2_lane_mf8(ptr, a, 7); +} + +/* +** test_ld2q_lane1: { target lp64 } +** ld2 {v0.b( - |, )v1.b}\[0\], \[x0\] +** ret +*/ +mfloat8x16x2_t test_ld2q_lane1(const mfloat8_t *ptr, mfloat8x16x2_t a) +{ + return vld2q_lane_mf8(ptr, a, 0); +} + +/* +** test_ld2q_lane2: { target lp64 } +** ld2 {v0.b( - |, )v1.b}\[15\], \[x0\] +** ret +*/ +mfloat8x16x2_t test_ld2q_lane2(const mfloat8_t *ptr, 
mfloat8x16x2_t a) +{ + return vld2q_lane_mf8(ptr, a, 15); +} + +/* +** test_ld3: { target lp64 } +** ld3 {v0.8b - v2.8b}, \[x0\] +** ret +*/ +mfloat8x8x3_t test_ld3(const mfloat8_t *ptr) +{ + return vld3_mf8(ptr); +} + +/* +** test_ld3q: { target lp64 } +** ld3 {v0.16b - v2.16b}, \[x0\] +** ret +*/ +mfloat8x16x3_t test_ld3q(const mfloat8_t *ptr) +{ + return vld3q_mf8(ptr); +} + +/* +** test_ld3_dup: { target lp64 } +** ld3r {v0.8b - v2.8b}, \[x0\] +** ret +*/ +mfloat8x8x3_t test_ld3_dup(const mfloat8_t *ptr) +{ + return vld3_dup_mf8(ptr); +} + +/* +** test_ld3q_dup: { target lp64 } +** ld3r {v0.16b - v2.16b}, \[x0\] +** ret +*/ +mfloat8x16x3_t test_ld3q_dup(const mfloat8_t *ptr) +{ + return vld3q_dup_mf8(ptr); +} + +/* +** test_ld3_lane1: { target lp64 } +** ld3 {v0.b - v2.b}\[0\], \[x0\] +** ret +*/ +mfloat8x8x3_t test_ld3_lane1(const mfloat8_t *ptr, mfloat8x8x3_t a) +{ + return vld3_lane_mf8(ptr, a, 0); +} + +/* +** test_ld3_lane2: { target lp64 } +** ld3 {v0.b - v2.b}\[7\], \[x0\] +** ret +*/ +mfloat8x8x3_t test_ld3_lane2(const mfloat8_t *ptr, mfloat8x8x3_t a) +{ + return vld3_lane_mf8(ptr, a, 7); +} + +/* +** test_ld3q_lane1: { target lp64 } +** ld3 {v0.b - v2.b}\[0\], \[x0\] +** ret +*/ +mfloat8x16x3_t test_ld3q_lane1(const mfloat8_t *ptr, mfloat8x16x3_t a) +{ + return vld3q_lane_mf8(ptr, a, 0); +} + +/* +** test_ld3q_lane2: { target lp64 } +** ld3 {v0.b - v2.b}\[15\], \[x0\] +** ret +*/ +mfloat8x16x3_t test_ld3q_lane2(const mfloat8_t *ptr, mfloat8x16x3_t a) +{ + return vld3q_lane_mf8(ptr, a, 15); +} + +/* +** test_ld4: { target lp64 } +** ld4 {v0.8b - v3.8b}, \[x0\] +** ret +*/ +mfloat8x8x4_t test_ld4(const mfloat8_t *ptr) +{ + return vld4_mf8(ptr); +} + +/* +** test_ld4q: { target lp64 } +** ld4 {v0.16b - v3.16b}, \[x0\] +** ret +*/ +mfloat8x16x4_t test_ld4q(const mfloat8_t *ptr) +{ + return vld4q_mf8(ptr); +} + +/* +** test_ld4_dup: { target lp64 } +** ld4r {v0.8b - v3.8b}, \[x0\] +** ret +*/ +mfloat8x8x4_t test_ld4_dup(const mfloat8_t *ptr) +{ + return vld4_dup_mf8(ptr); +} + +/* +** test_ld4q_dup: { target lp64 } +** ld4r {v0.16b - v3.16b}, \[x0\] +** ret +*/ +mfloat8x16x4_t test_ld4q_dup(const mfloat8_t *ptr) +{ + return vld4q_dup_mf8(ptr); +} + +/* +** test_ld4_lane1: { target lp64 } +** ld4 {v0.b - v3.b}\[0\], \[x0\] +** ret +*/ +mfloat8x8x4_t test_ld4_lane1(const mfloat8_t *ptr, mfloat8x8x4_t a) +{ + return vld4_lane_mf8(ptr, a, 0); +} + +/* +** test_ld4_lane2: { target lp64 } +** ld4 {v0.b - v3.b}\[7\], \[x0\] +** ret +*/ +mfloat8x8x4_t test_ld4_lane2(const mfloat8_t *ptr, mfloat8x8x4_t a) +{ + return vld4_lane_mf8(ptr, a, 7); +} + +/* +** test_ld4q_lane1: { target lp64 } +** ld4 {v0.b - v3.b}\[0\], \[x0\] +** ret +*/ +mfloat8x16x4_t test_ld4q_lane1(const mfloat8_t *ptr, mfloat8x16x4_t a) +{ + return vld4q_lane_mf8(ptr, a, 0); +} + +/* +** test_ld4q_lane2: { target lp64 } +** ld4 {v0.b - v3.b}\[15\], \[x0\] +** ret +*/ +mfloat8x16x4_t test_ld4q_lane2(const mfloat8_t *ptr, mfloat8x16x4_t a) +{ + return vld4q_lane_mf8(ptr, a, 15); +} + +/* +** test_mov1: +** dup v0.8b, v1.b\[0\] +** ret +*/ +mfloat8x8_t test_mov1(mfloat8_t a, mfloat8_t b) +{ + return vmov_n_mf8(b); +} + +/* +** test_mov2: +** movi v0.2s, #?0 +** ret +*/ +mfloat8x8_t test_mov2() +{ + return vmov_n_mf8(((union { uint8_t x; mfloat8_t y; }) { 0 }).y); +} + +/* +** test_mov3: +** movi v0.8b, #?0xf +** ret +*/ +mfloat8x8_t test_mov3() +{ + return vmov_n_mf8(((union { uint8_t x; mfloat8_t y; }) { 0x0f }).y); +} + +/* +** test_movq1: +** dup v0.16b, v1.b\[0\] +** ret +*/ +mfloat8x16_t test_movq1(mfloat8_t a, mfloat8_t 
b) +{ + return vmovq_n_mf8(b); +} + +/* +** test_movq2: +** movi v0.4s, #?0 +** ret +*/ +mfloat8x16_t test_movq2() +{ + return vmovq_n_mf8(((union { uint8_t x; mfloat8_t y; }) { 0 }).y); +} + +/* +** test_movq3: +** movi v0.16b, #?0xf +** ret +*/ +mfloat8x16_t test_movq3() +{ + return vmovq_n_mf8(((union { uint8_t x; mfloat8_t y; }) { 0x0f }).y); +} + +/* +** test_rev16: +** rev16 v0.8b, v1.8b +** ret +*/ +mfloat8x8_t test_rev16(mfloat8_t a, mfloat8x8_t b) +{ + return vrev16_mf8(b); +} + +/* +** test_rev16q: +** rev16 v0.16b, v1.16b +** ret +*/ +mfloat8x16_t test_rev16q(mfloat8_t a, mfloat8x16_t b) +{ + return vrev16q_mf8(b); +} + +/* +** test_rev32: +** rev32 v0.8b, v1.8b +** ret +*/ +mfloat8x8_t test_rev32(mfloat8_t a, mfloat8x8_t b) +{ + return vrev32_mf8(b); +} + +/* +** test_rev32q: +** rev32 v0.16b, v1.16b +** ret +*/ +mfloat8x16_t test_rev32q(mfloat8_t a, mfloat8x16_t b) +{ + return vrev32q_mf8(b); +} + +/* +** test_rev64: +** rev64 v0.8b, v1.8b +** ret +*/ +mfloat8x8_t test_rev64(mfloat8_t a, mfloat8x8_t b) +{ + return vrev64_mf8(b); +} + +/* +** test_rev64q: +** rev64 v0.16b, v1.16b +** ret +*/ +mfloat8x16_t test_rev64q(mfloat8_t a, mfloat8x16_t b) +{ + return vrev64q_mf8(b); +} + +/* +** test_set_lane1: +** ins v0.b\[0\], v1.b\[0\] +** ret +*/ +mfloat8x8_t test_set_lane1(mfloat8x8_t a, mfloat8_t b) +{ + return vset_lane_mf8(b, a, 0); +} + +/* +** test_set_lane2: +** ins v0.b\[7\], v1.b\[0\] +** ret +*/ +mfloat8x8_t test_set_lane2(mfloat8x8_t a, mfloat8_t b) +{ + return vset_lane_mf8(b, a, 7); +} + +/* +** test_set_lane3: { target lp64 } +** ld1 {v0.b}\[3\], \[x0\] +** ret +*/ +mfloat8x8_t test_set_lane3(mfloat8x8_t a, const mfloat8_t *ptr) +{ + return vset_lane_mf8(*ptr, a, 3); +} + +/* +** test_set_lane4: +** ins v0.b\[6\], wzr +** ret +*/ +mfloat8x8_t test_set_lane4(mfloat8x8_t a) +{ + return vset_lane_mf8(((union { uint8_t x; mfloat8_t y; }) { 0 }).y, a, 6); +} + +/* +** test_setq_lane1: +** ins v0.b\[0\], v1.b\[0\] +** ret +*/ +mfloat8x16_t test_setq_lane1(mfloat8x16_t a, mfloat8_t b) +{ + return vsetq_lane_mf8(b, a, 0); +} + +/* +** test_setq_lane2: +** ins v0.b\[15\], v1.b\[0\] +** ret +*/ +mfloat8x16_t test_setq_lane2(mfloat8x16_t a, mfloat8_t b) +{ + return vsetq_lane_mf8(b, a, 15); +} + +/* +** test_setq_lane3: { target lp64 } +** ld1 {v0.b}\[9\], \[x0\] +** ret +*/ +mfloat8x16_t test_setq_lane3(mfloat8x16_t a, const mfloat8_t *ptr) +{ + return vsetq_lane_mf8(*ptr, a, 9); +} + +/* +** test_setq_lane4: +** ins v0.b\[14\], wzr +** ret +*/ +mfloat8x16_t test_setq_lane4(mfloat8x16_t a) +{ + return vsetq_lane_mf8(((union { uint8_t x; mfloat8_t y; }) { 0 }).y, a, 14); +} + +/* +** test_st1: { target { le && lp64 } } +** str d0, \[x0\] +** ret +*/ +/* +** test_st1: { target { be && lp64 } } +** st1 {v0.8b}, \[x0\] +** ret +*/ +void test_st1(mfloat8_t *ptr, mfloat8x8_t a) +{ + vst1_mf8(ptr, a); +} + +/* +** test_st1q: { target { le && lp64 } } +** str q0, \[x0\] +** ret +*/ +/* +** test_st1q: { target { be && lp64 } } +** st1 {v0.16b}, \[x0\] +** ret +*/ +void test_st1q(mfloat8_t *ptr, mfloat8x16_t a) +{ + vst1q_mf8(ptr, a); +} + +/* +** test_st1_lane1: { target lp64 } +** str b0, \[x0\] +** ret +*/ +void test_st1_lane1(mfloat8_t *ptr, mfloat8x8_t a) +{ + vst1_lane_mf8(ptr, a, 0); +} + +/* +** test_st1_lane2: { target lp64 } +** st1 {v0.b}\[7\], \[x0\] +** ret +*/ +void test_st1_lane2(mfloat8_t *ptr, mfloat8x8_t a) +{ + vst1_lane_mf8(ptr, a, 7); +} + +/* +** test_st1q_lane1: { target lp64 } +** str b0, \[x0\] +** ret +*/ +void test_st1q_lane1(mfloat8_t *ptr, mfloat8x16_t a) +{ + 
vst1q_lane_mf8(ptr, a, 0); +} + +/* +** test_st1q_lane2: { target lp64 } +** st1 {v0.b}\[15\], \[x0\] +** ret +*/ +void test_st1q_lane2(mfloat8_t *ptr, mfloat8x16_t a) +{ + vst1q_lane_mf8(ptr, a, 15); +} + +/* +** test_st1_x2: { target lp64 } +** st1 {v0.8b( - |, )v1.8b}, \[x0\] +** ret +*/ +void test_st1_x2(mfloat8_t *ptr, mfloat8x8x2_t a) +{ + vst1_mf8_x2(ptr, a); +} + +/* +** test_st1q_x2: { target lp64 } +** st1 {v0.16b( - |, )v1.16b}, \[x0\] +** ret +*/ +void test_st1q_x2(mfloat8_t *ptr, mfloat8x16x2_t a) +{ + vst1q_mf8_x2(ptr, a); +} + +/* +** test_st1_x3: { target lp64 } +** st1 {v0.8b - v2.8b}, \[x0\] +** ret +*/ +void test_st1_x3(mfloat8_t *ptr, mfloat8x8x3_t a) +{ + vst1_mf8_x3(ptr, a); +} + +/* +** test_st1q_x3: { target lp64 } +** st1 {v0.16b - v2.16b}, \[x0\] +** ret +*/ +void test_st1q_x3(mfloat8_t *ptr, mfloat8x16x3_t a) +{ + vst1q_mf8_x3(ptr, a); +} + +/* +** test_st1_x4: { target lp64 } +** st1 {v0.8b - v3.8b}, \[x0\] +** ret +*/ +void test_st1_x4(mfloat8_t *ptr, mfloat8x8x4_t a) +{ + vst1_mf8_x4(ptr, a); +} + +/* +** test_st1q_x4: { target lp64 } +** st1 {v0.16b - v3.16b}, \[x0\] +** ret +*/ +void test_st1q_x4(mfloat8_t *ptr, mfloat8x16x4_t a) +{ + vst1q_mf8_x4(ptr, a); +} + +/* +** test_st2: { target lp64 } +** st2 {v0.8b( - |, )v1.8b}, \[x0\] +** ret +*/ +void test_st2(mfloat8_t *ptr, mfloat8x8x2_t a) +{ + vst2_mf8(ptr, a); +} + +/* +** test_st2q: { target lp64 } +** st2 {v0.16b( - |, )v1.16b}, \[x0\] +** ret +*/ +void test_st2q(mfloat8_t *ptr, mfloat8x16x2_t a) +{ + vst2q_mf8(ptr, a); +} + +/* +** test_st2_lane1: { target lp64 } +** st2 {v0.b( - |, )v1.b}\[0\], \[x0\] +** ret +*/ +void test_st2_lane1(mfloat8_t *ptr, mfloat8x8x2_t a) +{ + vst2_lane_mf8(ptr, a, 0); +} + +/* +** test_st2_lane2: { target lp64 } +** st2 {v0.b( - |, )v1.b}\[7\], \[x0\] +** ret +*/ +void test_st2_lane2(mfloat8_t *ptr, mfloat8x8x2_t a) +{ + vst2_lane_mf8(ptr, a, 7); +} + +/* +** test_st2q_lane1: { target lp64 } +** st2 {v0.b( - |, )v1.b}\[0\], \[x0\] +** ret +*/ +void test_st2q_lane1(mfloat8_t *ptr, mfloat8x16x2_t a) +{ + vst2q_lane_mf8(ptr, a, 0); +} + +/* +** test_st2q_lane2: { target lp64 } +** st2 {v0.b( - |, )v1.b}\[15\], \[x0\] +** ret +*/ +void test_st2q_lane2(mfloat8_t *ptr, mfloat8x16x2_t a) +{ + vst2q_lane_mf8(ptr, a, 15); +} + +/* +** test_st3: { target lp64 } +** st3 {v0.8b - v2.8b}, \[x0\] +** ret +*/ +void test_st3(mfloat8_t *ptr, mfloat8x8x3_t a) +{ + vst3_mf8(ptr, a); +} + +/* +** test_st3q: { target lp64 } +** st3 {v0.16b - v2.16b}, \[x0\] +** ret +*/ +void test_st3q(mfloat8_t *ptr, mfloat8x16x3_t a) +{ + vst3q_mf8(ptr, a); +} + +/* +** test_st3_lane1: { target lp64 } +** st3 {v0.b - v2.b}\[0\], \[x0\] +** ret +*/ +void test_st3_lane1(mfloat8_t *ptr, mfloat8x8x3_t a) +{ + vst3_lane_mf8(ptr, a, 0); +} + +/* +** test_st3_lane2: { target lp64 } +** st3 {v0.b - v2.b}\[7\], \[x0\] +** ret +*/ +void test_st3_lane2(mfloat8_t *ptr, mfloat8x8x3_t a) +{ + vst3_lane_mf8(ptr, a, 7); +} + +/* +** test_st3q_lane1: { target lp64 } +** st3 {v0.b - v2.b}\[0\], \[x0\] +** ret +*/ +void test_st3q_lane1(mfloat8_t *ptr, mfloat8x16x3_t a) +{ + vst3q_lane_mf8(ptr, a, 0); +} + +/* +** test_st3q_lane2: { target lp64 } +** st3 {v0.b - v2.b}\[15\], \[x0\] +** ret +*/ +void test_st3q_lane2(mfloat8_t *ptr, mfloat8x16x3_t a) +{ + vst3q_lane_mf8(ptr, a, 15); +} + +/* +** test_st4: { target lp64 } +** st4 {v0.8b - v3.8b}, \[x0\] +** ret +*/ +void test_st4(mfloat8_t *ptr, mfloat8x8x4_t a) +{ + vst4_mf8(ptr, a); +} + +/* +** test_st4q: { target lp64 } +** st4 {v0.16b - v3.16b}, \[x0\] +** ret +*/ +void 
test_st4q(mfloat8_t *ptr, mfloat8x16x4_t a) +{ + vst4q_mf8(ptr, a); +} + +/* +** test_st4_lane1: { target lp64 } +** st4 {v0.b - v3.b}\[0\], \[x0\] +** ret +*/ +void test_st4_lane1(mfloat8_t *ptr, mfloat8x8x4_t a) +{ + vst4_lane_mf8(ptr, a, 0); +} + +/* +** test_st4_lane2: { target lp64 } +** st4 {v0.b - v3.b}\[7\], \[x0\] +** ret +*/ +void test_st4_lane2(mfloat8_t *ptr, mfloat8x8x4_t a) +{ + vst4_lane_mf8(ptr, a, 7); +} + +/* +** test_st4q_lane1: { target lp64 } +** st4 {v0.b - v3.b}\[0\], \[x0\] +** ret +*/ +void test_st4q_lane1(mfloat8_t *ptr, mfloat8x16x4_t a) +{ + vst4q_lane_mf8(ptr, a, 0); +} + +/* +** test_st4q_lane2: { target lp64 } +** st4 {v0.b - v3.b}\[15\], \[x0\] +** ret +*/ +void test_st4q_lane2(mfloat8_t *ptr, mfloat8x16x4_t a) +{ + vst4q_lane_mf8(ptr, a, 15); +} + +/* +** test_tbl1: +** fmov d([0-9]+), d0 +** tbl v0.8b, {v\1.16b}, v1.8b +** ret +*/ +mfloat8x8_t test_tbl1(mfloat8x8_t a, uint8x8_t b) +{ + return vtbl1_mf8(a, b); +} + +/* +** test_tbl2: +** uzp1 v([0-9]+).2d, v0.2d, v1.2d +** tbl v0.8b, {v\1.16b}, v2.8b +** ret +*/ +mfloat8x8_t test_tbl2(mfloat8x8x2_t a, uint8x8_t b) +{ + return vtbl2_mf8(a, b); +} + +/* +** test_tbl3: +** uzp1 v([0-9]+).2d, v0.2d, v1.2d +** fmov d([0-9]+), d2 +** tbl v0.8b, {v\1.16b( - |, )v\2.16b}, v3.8b +** ret +*/ +mfloat8x8_t test_tbl3(mfloat8x8x3_t a, uint8x8_t b) +{ + return vtbl3_mf8(a, b); +} + +/* +** test_tbl4: +** uzp1 v([0-9]+).2d, v0.2d, v1.2d +** uzp1 v([0-9]+).2d, v2.2d, v3.2d +** tbl v0.8b, {v\1.16b( - |, )v\2.16b}, v4.8b +** ret +*/ +mfloat8x8_t test_tbl4(mfloat8x8x4_t a, uint8x8_t b) +{ + return vtbl4_mf8(a, b); +} + +/* +** test_qtbl1: +** tbl v0.8b, {v0.16b}, v1.8b +** ret +*/ +mfloat8x8_t test_qtbl1(mfloat8x16_t a, uint8x8_t b) +{ + return vqtbl1_mf8(a, b); +} + +/* +** test_qtbl1q: +** tbl v0.16b, {v0.16b}, v1.16b +** ret +*/ +mfloat8x16_t test_qtbl1q(mfloat8x16_t a, uint8x16_t b) +{ + return vqtbl1q_mf8(a, b); +} + +/* +** test_qtbl2: +** tbl v0.8b, {v0.16b( - |, )v1.16b}, v2.8b +** ret +*/ +mfloat8x8_t test_qtbl2(mfloat8x16x2_t a, uint8x8_t b) +{ + return vqtbl2_mf8(a, b); +} + +/* +** test_qtbl2q: +** tbl v0.16b, {v0.16b( - |, )v1.16b}, v2.16b +** ret +*/ +mfloat8x16_t test_qtbl2q(mfloat8x16x2_t a, uint8x16_t b) +{ + return vqtbl2q_mf8(a, b); +} + +/* +** test_qtbl3: +** tbl v0.8b, {v0.16b - v2.16b}, v3.8b +** ret +*/ +mfloat8x8_t test_qtbl3(mfloat8x16x3_t a, uint8x8_t b) +{ + return vqtbl3_mf8(a, b); +} + +/* +** test_qtbl3q: +** tbl v0.16b, {v0.16b - v2.16b}, v3.16b +** ret +*/ +mfloat8x16_t test_qtbl3q(mfloat8x16x3_t a, uint8x16_t b) +{ + return vqtbl3q_mf8(a, b); +} + +/* +** test_qtbl4: +** tbl v0.8b, {v0.16b - v3.16b}, v4.8b +** ret +*/ +mfloat8x8_t test_qtbl4(mfloat8x16x4_t a, uint8x8_t b) +{ + return vqtbl4_mf8(a, b); +} + +/* +** test_qtbl4q: +** tbl v0.16b, {v0.16b - v3.16b}, v4.16b +** ret +*/ +mfloat8x16_t test_qtbl4q(mfloat8x16x4_t a, uint8x16_t b) +{ + return vqtbl4q_mf8(a, b); +} + +/* +** test_tbx1: +** fmov d([0-9]+), d1 +** tbl v[0-9]+.8b, {v\1.16b}, v2.8b +** ... +** cmh[is] [^\n]+ +** (bit|bif|bsl) [^\n]+ +** ret +*/ +mfloat8x8_t test_tbx1(mfloat8x8_t a, mfloat8x8_t b, uint8x8_t c) +{ + return vtbx1_mf8(a, b, c); +} + +/* +** test_tbx2: +** uzp1 v([0-9]+).2d, v1.2d, v2.2d +** tbx v[0-9]+.8b, {v\1.16b}, v3.8b +** ret +*/ +mfloat8x8_t test_tbx2(mfloat8x8_t a, mfloat8x8x2_t b, uint8x8_t c) +{ + return vtbx2_mf8(a, b, c); +} + +/* +** test_tbx3: +** uzp1 v([0-9]+).2d, v1.2d, v2.2d +** fmov d([0-9]+), d3 +** tbl v[0-9]+.8b, {v\1.16b( - |, )v\2.16b}, v4.8b +** ... 
+** cmh[is] [^\n]+ +** (bit|bif|bsl) [^\n]+ +** ret +*/ +mfloat8x8_t test_tbx3(mfloat8x8_t a, mfloat8x8x3_t b, uint8x8_t c) +{ + return vtbx3_mf8(a, b, c); +} + +/* +** test_tbx4: +** uzp1 v([0-9]+).2d, v1.2d, v2.2d +** uzp1 v([0-9]+).2d, v3.2d, v4.2d +** tbx v0.8b, {v\1.16b( - |, )v\2.16b}, v5.8b +** ret +*/ +mfloat8x8_t test_tbx4(mfloat8x8_t a, mfloat8x8x4_t b, uint8x8_t c) +{ + return vtbx4_mf8(a, b, c); +} + +/* +** test_qtbx1: +** tbx v0.8b, {v1.16b}, v2.8b +** ret +*/ +mfloat8x8_t test_qtbx1(mfloat8x8_t a, mfloat8x16_t b, uint8x8_t c) +{ + return vqtbx1_mf8(a, b, c); +} + +/* +** test_qtbx1q: +** tbx v0.16b, {v1.16b}, v2.16b +** ret +*/ +mfloat8x16_t test_qtbx1q(mfloat8x16_t a, mfloat8x16_t b, uint8x16_t c) +{ + return vqtbx1q_mf8(a, b, c); +} + +/* +** test_qtbx2: +** tbx v0.8b, {v1.16b( - |, )v2.16b}, v3.8b +** ret +*/ +mfloat8x8_t test_qtbx2(mfloat8x8_t a, mfloat8x16x2_t b, uint8x8_t c) +{ + return vqtbx2_mf8(a, b, c); +} + +/* +** test_qtbx2q: +** tbx v0.16b, {v1.16b( - |, )v2.16b}, v3.16b +** ret +*/ +mfloat8x16_t test_qtbx2q(mfloat8x16_t a, mfloat8x16x2_t b, uint8x16_t c) +{ + return vqtbx2q_mf8(a, b, c); +} + +/* +** test_qtbx3: +** tbx v0.8b, {v1.16b - v3.16b}, v4.8b +** ret +*/ +mfloat8x8_t test_qtbx3(mfloat8x8_t a, mfloat8x16x3_t b, uint8x8_t c) +{ + return vqtbx3_mf8(a, b, c); +} + +/* +** test_qtbx3q: +** tbx v0.16b, {v1.16b - v3.16b}, v4.16b +** ret +*/ +mfloat8x16_t test_qtbx3q(mfloat8x16_t a, mfloat8x16x3_t b, uint8x16_t c) +{ + return vqtbx3q_mf8(a, b, c); +} + +/* +** test_qtbx4: +** tbx v0.8b, {v1.16b - v4.16b}, v5.8b +** ret +*/ +mfloat8x8_t test_qtbx4(mfloat8x8_t a, mfloat8x16x4_t b, uint8x8_t c) +{ + return vqtbx4_mf8(a, b, c); +} + +/* +** test_qtbx4q: +** tbx v0.16b, {v1.16b - v4.16b}, v5.16b +** ret +*/ +mfloat8x16_t test_qtbx4q(mfloat8x16_t a, mfloat8x16x4_t b, uint8x16_t c) +{ + return vqtbx4q_mf8(a, b, c); +} + +/* +** test_trn: +** trn1 v0.8b, v2.8b, v3.8b +** trn2 v1.8b, v2.8b, v3.8b +** ret +*/ +mfloat8x8x2_t test_trn(mfloat8_t a, mfloat8_t b, mfloat8x8_t c, mfloat8x8_t d) +{ + return vtrn_mf8(c, d); +} + +/* +** test_trnq: +** trn1 v0.16b, v2.16b, v3.16b +** trn2 v1.16b, v2.16b, v3.16b +** ret +*/ +mfloat8x16x2_t test_trnq(mfloat8_t a, mfloat8_t b, + mfloat8x16_t c, mfloat8x16_t d) +{ + return vtrnq_mf8(c, d); +} + +/* +** test_trn1: +** trn1 v0.8b, v1.8b, v2.8b +** ret +*/ +mfloat8x8_t test_trn1(mfloat8_t a, mfloat8x8_t b, mfloat8x8_t c) +{ + return vtrn1_mf8(b, c); +} + +/* +** test_trn1q: +** trn1 v0.16b, v1.16b, v2.16b +** ret +*/ +mfloat8x16_t test_trn1q(mfloat8_t a, mfloat8x16_t b, mfloat8x16_t c) +{ + return vtrn1q_mf8(b, c); +} + +/* +** test_trn2: +** trn2 v0.8b, v1.8b, v2.8b +** ret +*/ +mfloat8x8_t test_trn2(mfloat8_t a, mfloat8x8_t b, mfloat8x8_t c) +{ + return vtrn2_mf8(b, c); +} + +/* +** test_trn2q: +** trn2 v0.16b, v1.16b, v2.16b +** ret +*/ +mfloat8x16_t test_trn2q(mfloat8_t a, mfloat8x16_t b, mfloat8x16_t c) +{ + return vtrn2q_mf8(b, c); +} + +/* +** test_uzp: +** uzp1 v0.8b, v2.8b, v3.8b +** uzp2 v1.8b, v2.8b, v3.8b +** ret +*/ +mfloat8x8x2_t test_uzp(mfloat8_t a, mfloat8_t b, mfloat8x8_t c, mfloat8x8_t d) +{ + return vuzp_mf8(c, d); +} + +/* +** test_uzpq: +** uzp1 v0.16b, v2.16b, v3.16b +** uzp2 v1.16b, v2.16b, v3.16b +** ret +*/ +mfloat8x16x2_t test_uzpq(mfloat8_t a, mfloat8_t b, + mfloat8x16_t c, mfloat8x16_t d) +{ + return vuzpq_mf8(c, d); +} + +/* +** test_uzp1: +** uzp1 v0.8b, v1.8b, v2.8b +** ret +*/ +mfloat8x8_t test_uzp1(mfloat8_t a, mfloat8x8_t b, mfloat8x8_t c) +{ + return vuzp1_mf8(b, c); +} + +/* +** test_uzp1q: +** uzp1 
v0.16b, v1.16b, v2.16b +** ret +*/ +mfloat8x16_t test_uzp1q(mfloat8_t a, mfloat8x16_t b, mfloat8x16_t c) +{ + return vuzp1q_mf8(b, c); +} + +/* +** test_uzp2: +** uzp2 v0.8b, v1.8b, v2.8b +** ret +*/ +mfloat8x8_t test_uzp2(mfloat8_t a, mfloat8x8_t b, mfloat8x8_t c) +{ + return vuzp2_mf8(b, c); +} + +/* +** test_uzp2q: +** uzp2 v0.16b, v1.16b, v2.16b +** ret +*/ +mfloat8x16_t test_uzp2q(mfloat8_t a, mfloat8x16_t b, mfloat8x16_t c) +{ + return vuzp2q_mf8(b, c); +} + +/* +** test_zip: +** zip1 v0.8b, v2.8b, v3.8b +** zip2 v1.8b, v2.8b, v3.8b +** ret +*/ +mfloat8x8x2_t test_zip(mfloat8_t a, mfloat8_t b, mfloat8x8_t c, mfloat8x8_t d) +{ + return vzip_mf8(c, d); +} + +/* +** test_zipq: +** zip1 v0.16b, v2.16b, v3.16b +** zip2 v1.16b, v2.16b, v3.16b +** ret +*/ +mfloat8x16x2_t test_zipq(mfloat8_t a, mfloat8_t b, + mfloat8x16_t c, mfloat8x16_t d) +{ + return vzipq_mf8(c, d); +} + +/* +** test_zip1: +** zip1 v0.8b, v1.8b, v2.8b +** ret +*/ +mfloat8x8_t test_zip1(mfloat8_t a, mfloat8x8_t b, mfloat8x8_t c) +{ + return vzip1_mf8(b, c); +} + +/* +** test_zip1q: +** zip1 v0.16b, v1.16b, v2.16b +** ret +*/ +mfloat8x16_t test_zip1q(mfloat8_t a, mfloat8x16_t b, mfloat8x16_t c) +{ + return vzip1q_mf8(b, c); +} + +/* +** test_zip2: +** zip2 v0.8b, v1.8b, v2.8b +** ret +*/ +mfloat8x8_t test_zip2(mfloat8_t a, mfloat8x8_t b, mfloat8x8_t c) +{ + return vzip2_mf8(b, c); +} + +/* +** test_zip2q: +** zip2 v0.16b, v1.16b, v2.16b +** ret +*/ +mfloat8x16_t test_zip2q(mfloat8_t a, mfloat8x16_t b, mfloat8x16_t c) +{ + return vzip2q_mf8(b, c); +} diff --git a/gcc/testsuite/gcc.target/aarch64/simd/mf8_data_2.c b/gcc/testsuite/gcc.target/aarch64/simd/mf8_data_2.c new file mode 100644 index 0000000..0f923f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/mf8_data_2.c @@ -0,0 +1,98 @@ +/* { dg-do assemble } */ +/* { dg-additional-options "-O -std=gnu23 --save-temps" } */ + +#include <arm_neon.h> + +void test(mfloat8x8_t x8, mfloat8x16_t x16, + mfloat8x8x2_t x8x2, mfloat8x16x2_t x16x2, + mfloat8x8x3_t x8x3, mfloat8x16x3_t x16x3, + mfloat8x8x4_t x8x4, mfloat8x16x4_t x16x4, + mfloat8_t *ptr, mfloat8_t scalar) +{ + vcopy_lane_mf8(x8, -1, x8, 0); /* { dg-error {passing -1 to argument 2 of 'vcopy_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vcopy_lane_mf8(x8, 8, x8, 0); /* { dg-error {passing 8 to argument 2 of 'vcopy_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vcopy_lane_mf8(x8, 0, x8, -1); /* { dg-error {passing -1 to argument 4 of 'vcopy_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vcopy_lane_mf8(x8, 0, x8, 8); /* { dg-error {passing 8 to argument 4 of 'vcopy_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vcopy_lane_mf8(x8, 100, x8, 100); /* { dg-error {passing 100 to argument 2 of 'vcopy_lane_mf8', which expects a value in the range \[0, 7\]} } */ + /* { dg-error {passing 100 to argument 4 of 'vcopy_lane_mf8', which expects a value in the range \[0, 7\]} "" { target *-*-* } .-1 } */ + + vcopy_laneq_mf8(x8, -1, x16, 0); /* { dg-error {passing -1 to argument 2 of 'vcopy_laneq_mf8', which expects a value in the range \[0, 7\]} } */ + vcopy_laneq_mf8(x8, 8, x16, 0); /* { dg-error {passing 8 to argument 2 of 'vcopy_laneq_mf8', which expects a value in the range \[0, 7\]} } */ + vcopy_laneq_mf8(x8, 0, x16, -1); /* { dg-error {passing -1 to argument 4 of 'vcopy_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + vcopy_laneq_mf8(x8, 0, x16, 16); /* { dg-error {passing 16 to argument 4 of 'vcopy_laneq_mf8', which expects a value in the range 
\[0, 15\]} } */ + + vcopyq_lane_mf8(x16, -1, x8, 0); /* { dg-error {passing -1 to argument 2 of 'vcopyq_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vcopyq_lane_mf8(x16, 16, x8, 0); /* { dg-error {passing 16 to argument 2 of 'vcopyq_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vcopyq_lane_mf8(x16, 0, x8, -1); /* { dg-error {passing -1 to argument 4 of 'vcopyq_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vcopyq_lane_mf8(x16, 0, x8, 8); /* { dg-error {passing 8 to argument 4 of 'vcopyq_lane_mf8', which expects a value in the range \[0, 7\]} } */ + + vcopyq_laneq_mf8(x16, -1, x16, 0); /* { dg-error {passing -1 to argument 2 of 'vcopyq_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + vcopyq_laneq_mf8(x16, 16, x16, 0); /* { dg-error {passing 16 to argument 2 of 'vcopyq_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + vcopyq_laneq_mf8(x16, 0, x16, -1); /* { dg-error {passing -1 to argument 4 of 'vcopyq_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + vcopyq_laneq_mf8(x16, 0, x16, 16); /* { dg-error {passing 16 to argument 4 of 'vcopyq_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + + vdup_lane_mf8(x8, -1); /* { dg-error {passing -1 to argument 2 of 'vdup_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vdup_lane_mf8(x8, 8); /* { dg-error {passing 8 to argument 2 of 'vdup_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vdup_laneq_mf8(x16, -1); /* { dg-error {passing -1 to argument 2 of 'vdup_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + vdup_laneq_mf8(x16, 16); /* { dg-error {passing 16 to argument 2 of 'vdup_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + + vdupq_lane_mf8(x8, -1); /* { dg-error {passing -1 to argument 2 of 'vdupq_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vdupq_lane_mf8(x8, 8); /* { dg-error {passing 8 to argument 2 of 'vdupq_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vdupq_laneq_mf8(x16, -1); /* { dg-error {passing -1 to argument 2 of 'vdupq_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + vdupq_laneq_mf8(x16, 16); /* { dg-error {passing 16 to argument 2 of 'vdupq_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + + vdupb_lane_mf8(x8, -1); /* { dg-error {passing -1 to argument 2 of 'vdupb_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vdupb_lane_mf8(x8, 8); /* { dg-error {passing 8 to argument 2 of 'vdupb_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vdupb_laneq_mf8(x16, -1); /* { dg-error {passing -1 to argument 2 of 'vdupb_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + vdupb_laneq_mf8(x16, 16); /* { dg-error {passing 16 to argument 2 of 'vdupb_laneq_mf8', which expects a value in the range \[0, 15\]} } */ + + vext_mf8(x8, x8, -1); /* { dg-error {passing -1 to argument 3 of 'vext_mf8', which expects a value in the range \[0, 7\]} } */ + vext_mf8(x8, x8, 8); /* { dg-error {passing 8 to argument 3 of 'vext_mf8', which expects a value in the range \[0, 7\]} } */ + vextq_mf8(x16, x16, -1); /* { dg-error {passing -1 to argument 3 of 'vextq_mf8', which expects a value in the range \[0, 15\]} } */ + vextq_mf8(x16, x16, 16); /* { dg-error {passing 16 to argument 3 of 'vextq_mf8', which expects a value in the range \[0, 15\]} } */ + + vld1_lane_mf8(ptr, x8, -1); /* { dg-error {passing -1 to argument 3 of 'vld1_lane_mf8', which expects a value in the range \[0, 7\]} } */ + 
vld1_lane_mf8(ptr, x8, 8); /* { dg-error {passing 8 to argument 3 of 'vld1_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vld1q_lane_mf8(ptr, x16, -1); /* { dg-error {passing -1 to argument 3 of 'vld1q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vld1q_lane_mf8(ptr, x16, 16); /* { dg-error {passing 16 to argument 3 of 'vld1q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + + vld2_lane_mf8(ptr, x8x2, -1); /* { dg-error {passing -1 to argument 3 of 'vld2_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vld2_lane_mf8(ptr, x8x2, 8); /* { dg-error {passing 8 to argument 3 of 'vld2_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vld2q_lane_mf8(ptr, x16x2, -1); /* { dg-error {passing -1 to argument 3 of 'vld2q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vld2q_lane_mf8(ptr, x16x2, 16); /* { dg-error {passing 16 to argument 3 of 'vld2q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + + vld3_lane_mf8(ptr, x8x3, -1); /* { dg-error {passing -1 to argument 3 of 'vld3_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vld3_lane_mf8(ptr, x8x3, 8); /* { dg-error {passing 8 to argument 3 of 'vld3_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vld3q_lane_mf8(ptr, x16x3, -1); /* { dg-error {passing -1 to argument 3 of 'vld3q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vld3q_lane_mf8(ptr, x16x3, 16); /* { dg-error {passing 16 to argument 3 of 'vld3q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + + vld4_lane_mf8(ptr, x8x4, -1); /* { dg-error {passing -1 to argument 3 of 'vld4_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vld4_lane_mf8(ptr, x8x4, 8); /* { dg-error {passing 8 to argument 3 of 'vld4_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vld4q_lane_mf8(ptr, x16x4, -1); /* { dg-error {passing -1 to argument 3 of 'vld4q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vld4q_lane_mf8(ptr, x16x4, 16); /* { dg-error {passing 16 to argument 3 of 'vld4q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + + vset_lane_mf8(scalar, x8, -1); /* { dg-error {passing -1 to argument 3 of 'vset_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vset_lane_mf8(scalar, x8, 8); /* { dg-error {passing 8 to argument 3 of 'vset_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vsetq_lane_mf8(scalar, x16, -1); /* { dg-error {passing -1 to argument 3 of 'vsetq_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vsetq_lane_mf8(scalar, x16, 16); /* { dg-error {passing 16 to argument 3 of 'vsetq_lane_mf8', which expects a value in the range \[0, 15\]} } */ + + vst1_lane_mf8(ptr, x8, -1); /* { dg-error {passing -1 to argument 3 of 'vst1_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vst1_lane_mf8(ptr, x8, 8); /* { dg-error {passing 8 to argument 3 of 'vst1_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vst1q_lane_mf8(ptr, x16, -1); /* { dg-error {passing -1 to argument 3 of 'vst1q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vst1q_lane_mf8(ptr, x16, 16); /* { dg-error {passing 16 to argument 3 of 'vst1q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + + vst2_lane_mf8(ptr, x8x2, -1); /* { dg-error {passing -1 to argument 3 of 'vst2_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vst2_lane_mf8(ptr, x8x2, 8); /* { dg-error {passing 8 to argument 3 of 'vst2_lane_mf8', which expects a 
value in the range \[0, 7\]} } */ + vst2q_lane_mf8(ptr, x16x2, -1); /* { dg-error {passing -1 to argument 3 of 'vst2q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vst2q_lane_mf8(ptr, x16x2, 16); /* { dg-error {passing 16 to argument 3 of 'vst2q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + + vst3_lane_mf8(ptr, x8x3, -1); /* { dg-error {passing -1 to argument 3 of 'vst3_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vst3_lane_mf8(ptr, x8x3, 8); /* { dg-error {passing 8 to argument 3 of 'vst3_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vst3q_lane_mf8(ptr, x16x3, -1); /* { dg-error {passing -1 to argument 3 of 'vst3q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vst3q_lane_mf8(ptr, x16x3, 16); /* { dg-error {passing 16 to argument 3 of 'vst3q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + + vst4_lane_mf8(ptr, x8x4, -1); /* { dg-error {passing -1 to argument 3 of 'vst4_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vst4_lane_mf8(ptr, x8x4, 8); /* { dg-error {passing 8 to argument 3 of 'vst4_lane_mf8', which expects a value in the range \[0, 7\]} } */ + vst4q_lane_mf8(ptr, x16x4, -1); /* { dg-error {passing -1 to argument 3 of 'vst4q_lane_mf8', which expects a value in the range \[0, 15\]} } */ + vst4q_lane_mf8(ptr, x16x4, 16); /* { dg-error {passing 16 to argument 3 of 'vst4q_lane_mf8', which expects a value in the range \[0, 15\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_1.c index 596b9ee..2db3c3c 100644 --- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_1.c +++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_1.c @@ -6,6 +6,92 @@ extern void abort (void); +mfloat8x8_t __attribute__ ((noinline)) +wrap_vdup_lane_mf8_0 (mfloat8x8_t a) +{ + return vdup_lane_mf8 (a, 0); +} + +mfloat8x8_t __attribute__ ((noinline)) +wrap_vdup_lane_mf8_1 (mfloat8x8_t a) +{ + return vdup_lane_mf8 (a, 1); +} + +int __attribute__ ((noinline)) +test_vdup_lane_mf8 () +{ + mfloat8_t m; + uint8_t n = 11; + mfloat8x8_t a; + mfloat8x8_t b; + int i; + /* Only two first cases are interesting. */ + mfloat8_t c[8]; + mfloat8_t d[8]; + + __builtin_memcpy(&m, &n, 1); + b = vdup_n_mf8 (m); + vst1_mf8 (d, b); + + a = vld1_mf8 (c); + b = wrap_vdup_lane_mf8_0 (a); + vst1_mf8 (d, b); + for (i = 0; i < 8; i++) + if (__builtin_memcmp (&c[0], &d[i], 1) != 0) + return 1; + + b = wrap_vdup_lane_mf8_1 (a); + vst1_mf8 (d, b); + for (i = 0; i < 8; i++) + if (__builtin_memcmp (&c[1], &d[i], 1) != 0) + return 1; + return 0; +} + +mfloat8x16_t __attribute__ ((noinline)) +wrap_vdupq_lane_mf8_0 (mfloat8x8_t a) +{ + return vdupq_lane_mf8 (a, 0); +} + +mfloat8x16_t __attribute__ ((noinline)) +wrap_vdupq_lane_mf8_1 (mfloat8x8_t a) +{ + return vdupq_lane_mf8 (a, 1); +} + +int __attribute__ ((noinline)) +test_vdupq_lane_mf8 () +{ + mfloat8_t m; + uint8_t n = 11; + mfloat8x8_t a; + mfloat8x16_t b; + int i; + /* Only two first cases are interesting. 
*/ + mfloat8_t c[8]; + mfloat8_t d[16]; + + __builtin_memcpy(&m, &n, 1); + b = vdupq_n_mf8 (m); + vst1q_mf8 (d, b); + + a = vld1_mf8 (c); + b = wrap_vdupq_lane_mf8_0 (a); + vst1q_mf8 (d, b); + for (i = 0; i < 16; i++) + if (__builtin_memcmp (&c[0], &d[i], 1) != 0) + return 1; + + b = wrap_vdupq_lane_mf8_1 (a); + vst1q_mf8 (d, b); + for (i = 0; i < 16; i++) + if (__builtin_memcmp (&c[1], &d[i], 1) != 0) + return 1; + return 0; +} + float32x2_t __attribute__ ((noinline)) wrap_vdup_lane_f32_0 (float32x2_t a) { @@ -350,7 +436,10 @@ test_vdupq_lane_s64 () int main () { - + if (test_vdup_lane_mf8 ()) + abort (); + if (test_vdupq_lane_mf8 ()) + abort (); if (test_vdup_lane_f32 ()) abort (); if (test_vdup_lane_s8 ()) @@ -376,12 +465,12 @@ main () } /* Asm check for test_vdup_lane_s8. */ -/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8b, v\[0-9\]+\.b\\\[0\\\]" 1 } } */ -/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8b, v\[0-9\]+\.b\\\[1\\\]" 1 } } */ +/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8b, v\[0-9\]+\.b\\\[0\\\]" 2 } } */ +/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8b, v\[0-9\]+\.b\\\[1\\\]" 2 } } */ /* Asm check for test_vdupq_lane_s8. */ -/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.16b, v\[0-9\]+\.b\\\[0\\\]" 1 } } */ -/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.16b, v\[0-9\]+\.b\\\[1\\\]" 1 } } */ +/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.16b, v\[0-9\]+\.b\\\[0\\\]" 2 } } */ +/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.16b, v\[0-9\]+\.b\\\[1\\\]" 2 } } */ /* Asm check for test_vdup_lane_s16. */ /* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.4h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c index 16f4808..e9b4cdd 100644 --- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c +++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c @@ -11,6 +11,45 @@ extern void abort (void); +mfloat8_t __attribute__ ((noinline)) +wrap_vdupb_lane_mf8_0 (mfloat8x8_t dummy, mfloat8x8_t a) +{ + mfloat8_t result = vdupb_lane_mf8 (a, 0); + force_simd (result); + return result; +} + +mfloat8_t __attribute__ ((noinline)) +wrap_vdupb_lane_mf8_1 (mfloat8x8_t a) +{ + mfloat8_t result = vdupb_lane_mf8 (a, 1); + force_simd (result); + return result; +} + +int __attribute__ ((noinline)) +test_vdupb_lane_mf8 () +{ + mfloat8_t m; + uint8_t n = 11; + mfloat8x8_t a; + mfloat8_t b; + mfloat8_t c[8]; + + __builtin_memcpy(&m, &n, 1); + a = vdup_n_mf8 (m); + vst1_mf8 (c, a); + + b = wrap_vdupb_lane_mf8_0 (a, a); + if (__builtin_memcmp (&c[0], &b, 1) != 0) + return 1; + b = wrap_vdupb_lane_mf8_1 (a); + if (__builtin_memcmp (&c[1], &b, 1) != 0) + return 1; + + return 0; +} + float32_t __attribute__ ((noinline)) wrap_vdups_lane_f32_0 (float32x2_t dummy, float32x2_t a) { @@ -300,6 +339,8 @@ test_vdupd_lane_s64 () int main () { + if (test_vdupb_lane_mf8 ()) + abort (); if (test_vdups_lane_f32 ()) abort (); if (test_vdupd_lane_f64 ()) @@ -323,9 +364,9 @@ main () return 0; } -/* Asm check for vdupb_lane_s8, vdupb_lane_u8. */ +/* Asm check for vdupb_lane_s8, vdupb_lane_u8, and vdupb_lane_mf8. */ /* { dg-final { scan-assembler-not "dup\\tb\[0-9\]+, v\[0-9\]+\.b\\\[0\\\]" } } */ -/* { dg-final { scan-assembler-times "dup\\tb\[0-9\]+, v\[0-9\]+\.b\\\[1\\\]" 2 } } */ +/* { dg-final { scan-assembler-times "dup\\tb\[0-9\]+, v\[0-9\]+\.b\\\[1\\\]" 3 } } */ /* Asm check for vduph_lane_h16, vduph_lane_h16. 
*/ /* { dg-final { scan-assembler-not "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[0\\\]" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_n_1.c b/gcc/testsuite/gcc.target/aarch64/vdup_n_1.c index 99ac887..bac061b 100644 --- a/gcc/testsuite/gcc.target/aarch64/vdup_n_1.c +++ b/gcc/testsuite/gcc.target/aarch64/vdup_n_1.c @@ -6,6 +6,48 @@ extern void abort (void); +mfloat8x8_t __attribute__ ((noinline)) +wrap_vdup_n_mf8 (mfloat8_t a) +{ + return vdup_n_mf8 (a); +} + +int __attribute__ ((noinline)) +test_vdup_n_mf8 (mfloat8_t a) +{ + mfloat8x8_t b; + mfloat8_t c[8]; + int i; + + b = wrap_vdup_n_mf8 (a); + vst1_mf8 (c, b); + for (i = 0; i < 8; i++) + if (__builtin_memcmp (&a, &c[i], 1) != 0) + return 1; + return 0; +} + +mfloat8x16_t __attribute__ ((noinline)) +wrap_vdupq_n_mf8 (mfloat8_t a) +{ + return vdupq_n_mf8 (a); +} + +int __attribute__ ((noinline)) +test_vdupq_n_mf8 (mfloat8_t a) +{ + mfloat8x16_t b; + mfloat8_t c[16]; + int i; + + b = wrap_vdupq_n_mf8 (a); + vst1q_mf8 (c, b); + for (i = 0; i < 16; i++) + if (__builtin_memcmp (&a, &c[i], 1) != 0) + return 1; + return 0; +} + float32x2_t __attribute__ ((noinline)) wrap_vdup_n_f32 (float32_t a) { @@ -537,6 +579,16 @@ test_vdupq_n_u64 () int main () { + mfloat8_t a, c; + uint8_t b = 11; + uint8_t d = 12; + __builtin_memcpy(&a, &b, 1); + __builtin_memcpy(&c, &d, 1); + + if (test_vdup_n_mf8(a)) + abort (); + if (test_vdupq_n_mf8(c)) + abort (); if (test_vdup_n_f32 ()) abort (); if (test_vdup_n_f64 ()) @@ -591,12 +643,16 @@ main () /* No asm checks for vdup_n_f32, vdupq_n_f32, vdup_n_f64 and vdupq_n_f64. Cannot force floating point value in general purpose regester. */ -/* Asm check for test_vdup_n_p8, test_vdup_n_s8, test_vdup_n_u8. */ -/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8b, w\[0-9\]+" 3 } } */ +/* Asm check for test_vdup_n_mf8, test_vdup_n_p8, test_vdup_n_s8, + test_vdup_n_u8. */ +/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8b, w\[0-9\]+" 5 } } */ /* Asm check for test_vdupq_n_p8, test_vdupq_n_s8, test_vdupq_n_u8. */ /* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.16b, w\[0-9\]+" 3 } } */ +/* Asm check for test_vdupq_n_mf8. */ +/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.16b, v\[0-9\]+\.b\\\[0\\\]" 1 } } */ + /* Asm check for test_vdup_n_p16, test_vdup_n_s16, test_vdup_n_u16. 
*/ /* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.4h, w\[0-9\]+" 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c index 811dc67..6053dfa 100644 --- a/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c +++ b/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c @@ -14,7 +14,8 @@ test_copy##Q1##_lane##Q2##_##SUFFIX (TYPE1 a, TYPE2 b) \ BUILD_TEST (poly8x8_t, poly8x8_t, , , p8, 7, 6) BUILD_TEST (int8x8_t, int8x8_t, , , s8, 7, 6) BUILD_TEST (uint8x8_t, uint8x8_t, , , u8, 7, 6) -/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[6\\\]" 3 } } */ +BUILD_TEST (mfloat8x8_t, mfloat8x8_t, , , mf8, 7, 6) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[6\\\]" 4 } } */ BUILD_TEST (poly16x4_t, poly16x4_t, , , p16, 3, 2) BUILD_TEST (int16x4_t, int16x4_t, , , s16, 3, 2) BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2) @@ -33,7 +34,8 @@ BUILD_TEST (float64x1_t, float64x1_t, , , f64, 0, 0) BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15) BUILD_TEST (int8x8_t, int8x16_t, , q, s8, 7, 15) BUILD_TEST (uint8x8_t, uint8x16_t, , q, u8, 7, 15) -/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[15\\\]" 3 } } */ +BUILD_TEST (mfloat8x8_t, mfloat8x16_t, , q, mf8, 7, 15) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[15\\\]" 4 } } */ BUILD_TEST (poly16x4_t, poly16x8_t, , q, p16, 3, 7) BUILD_TEST (int16x4_t, int16x8_t, , q, s16, 3, 7) BUILD_TEST (uint16x4_t, uint16x8_t, , q, u16, 3, 7) @@ -51,7 +53,8 @@ BUILD_TEST (uint64x1_t, uint64x2_t, , q, u64, 0, 1) BUILD_TEST (poly8x16_t, poly8x8_t, q, , p8, 15, 7) BUILD_TEST (int8x16_t, int8x8_t, q, , s8, 15, 7) BUILD_TEST (uint8x16_t, uint8x8_t, q, , u8, 15, 7) -/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[15\\\], v1.b\\\[7\\\]" 3 } } */ +BUILD_TEST (mfloat8x16_t, mfloat8x8_t, q, , mf8, 15, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[15\\\], v1.b\\\[7\\\]" 4 } } */ BUILD_TEST (poly16x8_t, poly16x4_t, q, , p16, 7, 3) BUILD_TEST (int16x8_t, int16x4_t, q, , s16, 7, 3) BUILD_TEST (uint16x8_t, uint16x4_t, q, , u16, 7, 3) @@ -70,7 +73,8 @@ BUILD_TEST (uint64x2_t, uint64x1_t, q, , u64, 1, 0) BUILD_TEST (poly8x16_t, poly8x16_t, q, q, p8, 14, 15) BUILD_TEST (int8x16_t, int8x16_t, q, q, s8, 14, 15) BUILD_TEST (uint8x16_t, uint8x16_t, q, q, u8, 14, 15) -/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[14\\\], v1.b\\\[15\\\]" 3 } } */ +BUILD_TEST (mfloat8x16_t, mfloat8x16_t, q, q, mf8, 14, 15) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[14\\\], v1.b\\\[15\\\]" 4 } } */ BUILD_TEST (poly16x8_t, poly16x8_t, q, q, p16, 6, 7) BUILD_TEST (int16x8_t, int16x8_t, q, q, s16, 6, 7) BUILD_TEST (uint16x8_t, uint16x8_t, q, q, u16, 6, 7) |
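As an illustrative aside (not part of the patch): the mf8 data-movement intrinsics exercised by the tests above can only be driven through bit copies, since mfloat8_t has no literals, conversions or arithmetic. A minimal runtime-style check in the same spirit as the vdup tests, assuming only intrinsics added here (vld1_mf8, vst1_mf8, vdup_lane_mf8) and default compile options, might look like:

#include <arm_neon.h>

int
check_vdup_lane_mf8 (void)
{
  uint8_t bytes[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  mfloat8_t lanes[8];
  mfloat8_t out[8];
  int i;

  /* Reinterpret the bytes as mfloat8_t lane values; there is no direct
     way to write an mfloat8_t constant.  */
  __builtin_memcpy (lanes, bytes, 8);

  /* Load a vector, broadcast lane 3, and store the result.  */
  mfloat8x8_t v = vld1_mf8 (lanes);
  mfloat8x8_t dup = vdup_lane_mf8 (v, 3);
  vst1_mf8 (out, dup);

  /* Every output lane should now carry the bit pattern of input lane 3.  */
  for (i = 0; i < 8; i++)
    if (__builtin_memcmp (&lanes[3], &out[i], 1) != 0)
      return 1;
  return 0;
}

The lane argument to vdup_lane_mf8 must be an integer constant expression in [0, 7]; out-of-range values are rejected with the diagnostics checked in mf8_data_2.c above.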