10 files changed, 380 insertions, 15 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 9b7280a..a71c8c9 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -780,7 +780,7 @@ typedef struct
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
 
 #undef ENTRY
-#define ENTRY(N, S, M0, M1, M2, M3, USES_FPMR, U)	\
+#define ENTRY(N, S, M0, M1, M2, M3, M4, USES_FPMR, U)	\
   AARCH64_##N,
 
 enum aarch64_builtins
@@ -1590,9 +1590,10 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma)
 
 enum class aarch64_builtin_signatures
 {
+  unary,
   binary,
   ternary,
-  unary,
+  quaternary,
 };
 
 namespace {
@@ -1617,6 +1618,7 @@ namespace simd_types {
   constexpr simd_type s16q { V8HImode, qualifier_none };
   constexpr simd_type u16q { V8HImode, qualifier_unsigned };
 
+  constexpr simd_type s32_index { SImode, qualifier_lane_index };
   constexpr simd_type s32 { V2SImode, qualifier_none };
   constexpr simd_type s32q { V4SImode, qualifier_none };
 
@@ -1642,10 +1644,10 @@ namespace simd_types {
 }
 
 #undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, T3, USES_FPMR, U)		      \
+#define ENTRY(N, S, T0, T1, T2, T3, T4, USES_FPMR, U)		      \
   {#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \
-      simd_types::T2, simd_types::T3, U, USES_FPMR,		      \
-      aarch64_required_extensions::REQUIRED_EXTENSIONS},
+      simd_types::T2, simd_types::T3, simd_types::T4, U,	      \
+      USES_FPMR, aarch64_required_extensions::REQUIRED_EXTENSIONS},
 
 /* Initialize pragma builtins.  */
 
@@ -1653,7 +1655,7 @@ struct aarch64_pragma_builtins_data
 {
   const char *name;
   aarch64_builtin_signatures signature;
-  simd_type types[4];
+  simd_type types[5];
   int unspec;
   bool uses_fpmr;
   aarch64_required_extensions required_extensions;
@@ -1672,6 +1674,8 @@ aarch64_get_number_of_args (const aarch64_pragma_builtins_data &builtin_data)
     return 2;
   else if (builtin_data.signature == aarch64_builtin_signatures::ternary)
     return 3;
+  else if (builtin_data.signature == aarch64_builtin_signatures::quaternary)
+    return 4;
   else
     // No other signature supported.
     gcc_unreachable ();
@@ -2504,6 +2508,72 @@ aarch64_general_required_extensions (unsigned int code)
   return ext::streaming_compatible (0);
 }
 
+namespace function_checker {
+
+void
+require_integer_constant (location_t location, tree arg)
+{
+  if (TREE_CODE (arg) != INTEGER_CST)
+    {
+      error_at (location, "Constant-type integer argument expected");
+      return;
+    }
+}
+
+void
+require_immediate_range (location_t location, tree arg, HOST_WIDE_INT min,
+			 HOST_WIDE_INT max)
+{
+  if (wi::to_widest (arg) < min || wi::to_widest (arg) > max)
+    {
+      error_at (location, "lane out of range %wd - %wd", min, max);
+      return;
+    }
+}
+
+/* Validates indexing into a vector using the index's size and the instruction,
+   where instruction is represented by the unspec.
+   This only works for intrinsics declared using pragmas in
+   aarch64-simd-pragma-builtins.def.  */
+
+void
+check_simd_lane_bounds (location_t location, const aarch64_pragma_builtins_data
+			*builtin_data, tree *args)
+{
+  if (builtin_data == NULL)
+    // Don't check for functions that are not declared in
+    // aarch64-simd-pragma-builtins.def.
+    return;
+
+  auto nargs = aarch64_get_number_of_args (*builtin_data);
+  switch (builtin_data->unspec)
+    {
+    case UNSPEC_VDOT2:
+    case UNSPEC_VDOT4:
+      {
+	if (builtin_data->types[nargs].qualifiers != qualifier_lane_index)
+	  break;
+
+	auto index_arg = args[nargs - 1];
+	require_integer_constant (location, index_arg);
+
+	auto vector_to_index_mode = builtin_data->types[nargs - 1].mode;
+	int vector_to_index_mode_size
+	  = GET_MODE_NUNITS (vector_to_index_mode).to_constant ();
+
+	auto low = 0;
+	int high
+	  = builtin_data->unspec == UNSPEC_VDOT2
+	  ? vector_to_index_mode_size / 2 - 1
+	  : vector_to_index_mode_size / 4 - 1;
+	require_immediate_range (location, index_arg, low, high);
+	break;
+      }
+    }
+}
+
+};
+
 bool
 aarch64_general_check_builtin_call (location_t location, vec<location_t>,
 				    unsigned int code, tree fndecl,
@@ -2515,6 +2585,9 @@ aarch64_general_check_builtin_call (location_t location, vec<location_t>,
   if (!aarch64_check_required_extensions (location, decl, required_extensions))
     return false;
 
+  auto builtin_data = aarch64_get_pragma_builtin (code);
+  function_checker::check_simd_lane_bounds (location, builtin_data, args);
+
   switch (code)
     {
     case AARCH64_RSR:
@@ -3477,6 +3550,28 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
       expand_insn (icode, nargs + 1, ops);
       break;
 
+    case UNSPEC_VDOT2:
+    case UNSPEC_VDOT4:
+      if (builtin_data->signature == aarch64_builtin_signatures::ternary)
+	icode = code_for_aarch64 (builtin_data->unspec,
+				  builtin_data->types[0].mode,
+				  builtin_data->types[1].mode,
+				  builtin_data->types[2].mode,
+				  builtin_data->types[3].mode);
+      else if
+	(builtin_data->signature == aarch64_builtin_signatures::quaternary)
+	icode = code_for_aarch64 (builtin_data->unspec,
+				  builtin_data->types[0].mode,
+				  builtin_data->types[1].mode,
+				  builtin_data->types[2].mode,
+				  builtin_data->types[3].mode,
+				  builtin_data->types[4].mode);
+      else
+	gcc_unreachable ();
+
+      expand_insn (icode, nargs + 1, ops);
+      break;
+
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index b13366b..ae1472e 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -260,6 +260,10 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
 
   aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
 
+  aarch64_def_or_undef (TARGET_FP8DOT2, "__ARM_FEATURE_FP8DOT2", pfile);
+
+  aarch64_def_or_undef (TARGET_FP8DOT4, "__ARM_FEATURE_FP8DOT4", pfile);
+
   aarch64_def_or_undef (TARGET_LS64,
 			"__ARM_FEATURE_LS64", pfile);
   aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile);
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index c9d419a..44d2e18 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -236,6 +236,10 @@ AARCH64_OPT_EXTENSION("gcs", GCS, (), (), (), "gcs")
 
 AARCH64_OPT_EXTENSION("fp8", FP8, (SIMD), (), (), "fp8")
 
+AARCH64_OPT_EXTENSION("fp8dot2", FP8DOT2, (SIMD), (), (), "fp8dot2")
+
+AARCH64_OPT_EXTENSION("fp8dot4", FP8DOT4, (SIMD), (), (), "fp8dot4")
+
 AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax")
 
 #undef AARCH64_OPT_FMV_EXTENSION
diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
index 91897cf..4a94a66 100644
--- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
@@ -20,20 +20,33 @@
 
 
 #undef ENTRY_BINARY
-#define ENTRY_BINARY(N, T0, T1, T2, U)		\
-  ENTRY (N, binary, T0, T1, T2, none, false, U)
+#define ENTRY_BINARY(N, T0, T1, T2, U)			\
+  ENTRY (N, binary, T0, T1, T2, none, none, false, U)
 
 #undef ENTRY_BINARY_FPM
-#define ENTRY_BINARY_FPM(N, T0, T1, T2, U)	\
-  ENTRY (N, binary, T0, T1, T2, none, true, U)
+#define ENTRY_BINARY_FPM(N, T0, T1, T2, U)		\
+  ENTRY (N, binary, T0, T1, T2, none, none, true, U)
 
 #undef ENTRY_TERNARY_FPM
-#define ENTRY_TERNARY_FPM(N, T0, T1, T2, T3, U) \
-  ENTRY (N, ternary, T0, T1, T2, T3, true, U)
+#define ENTRY_TERNARY_FPM(N, T0, T1, T2, T3, U)		\
+  ENTRY (N, ternary, T0, T1, T2, T3, none, true, U)
+
+#undef ENTRY_TERNARY_FPM_LANE
+#define ENTRY_TERNARY_FPM_LANE(N, T0, T1, T2, T3, U)	\
+  ENTRY (N, quaternary, T0, T1, T2, T3, s32_index, true, U)
 
 #undef ENTRY_UNARY_FPM
-#define ENTRY_UNARY_FPM(N, T0, T1, U) \
-  ENTRY (N, unary, T0, T1, none, none, true, U)
+#define ENTRY_UNARY_FPM(N, T0, T1, U)			\
+  ENTRY (N, unary, T0, T1, none, none, none, true, U)
+
+#undef ENTRY_VDOT_FPM
+#define ENTRY_VDOT_FPM(T, U)						\
+  ENTRY_TERNARY_FPM (vdot_##T##_mf8_fpm, T, T, f8, f8, U)		\
+  ENTRY_TERNARY_FPM (vdotq_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U)	\
+  ENTRY_TERNARY_FPM_LANE (vdot_lane_##T##_mf8_fpm, T, T, f8, f8, U)	\
+  ENTRY_TERNARY_FPM_LANE (vdot_laneq_##T##_mf8_fpm, T, T, f8, f8q, U)	\
+  ENTRY_TERNARY_FPM_LANE (vdotq_lane_##T##_mf8_fpm, T##q, T##q, f8q, f8, U) \
+  ENTRY_TERNARY_FPM_LANE (vdotq_laneq_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U)
 
 #undef ENTRY_VHSDF
 #define ENTRY_VHSDF(NAME, UNSPEC) \
@@ -83,3 +96,13 @@ ENTRY_TERNARY_FPM (vcvt_high_mf8_f32_fpm, f8q, f8, f32q, f32q, UNSPEC_VCVT_HIGH)
 #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
 ENTRY_VHSDF_VHSDI (vscale, UNSPEC_FSCALE)
 #undef REQUIRED_EXTENSIONS
+
+// fpm dot2 product
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT2)
+ENTRY_VDOT_FPM (f16, UNSPEC_VDOT2)
+#undef REQUIRED_EXTENSIONS
+
+// fpm dot4 product
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT4)
+ENTRY_VDOT_FPM (f32, UNSPEC_VDOT4)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f843746..7b97486 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -10097,3 +10097,61 @@
   "TARGET_FP8"
   "<fpm_uns_op>\t%0.<VHSDF:Vtype>, %1.<VHSDF:Vtype>, %2.<VHSDI:Vtype>"
 )
+
+;; fpm vdot2 instructions.
+(define_insn
+  "@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB:mode>"
+  [(set (match_operand:VHF 0 "register_operand" "=w")
+	(unspec:VHF
+	 [(match_operand:VHF 1 "register_operand" "w")
+	  (match_operand:VB 2 "register_operand" "w")
+	  (match_operand:VB 3 "register_operand" "w")
+	  (reg:DI FPM_REGNUM)]
+	FPM_VDOT2_UNS))]
+  "TARGET_FP8DOT2"
+  "<fpm_uns_op>\t%1.<VHF:Vtype>, %2.<VB:Vtype>, %3.<VB:Vtype>"
+)
+
+;; fpm vdot2 instructions with lane.
+(define_insn
+  "@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB2:mode><SI_ONLY:mode>"
+  [(set (match_operand:VHF 0 "register_operand" "=w")
+	(unspec:VHF
+	 [(match_operand:VHF 1 "register_operand" "w")
+	  (match_operand:VB 2 "register_operand" "w")
+	  (match_operand:VB2 3 "register_operand" "w")
+	  (match_operand:SI_ONLY 4 "const_int_operand" "n")
+	  (reg:DI FPM_REGNUM)]
+	FPM_VDOT2_UNS))]
+  "TARGET_FP8DOT2"
+  "<fpm_uns_op>\t%1.<VHF:Vtype>, %2.<VB:Vtype>, %3.<VHF:Vdotlanetype>[%4]"
+)
+
+;; fpm vdot4 instructions.
+(define_insn
+  "@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB:mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	 [(match_operand:VDQSF 1 "register_operand" "w")
+	  (match_operand:VB 2 "register_operand" "w")
+	  (match_operand:VB 3 "register_operand" "w")
+	  (reg:DI FPM_REGNUM)]
+	FPM_VDOT4_UNS))]
+  "TARGET_FP8DOT4"
+  "<fpm_uns_op>\t%1.<VDQSF:Vtype>, %2.<VB:Vtype>, %3.<VB:Vtype>"
+)
+
+;; fpm vdot4 instructions with lane.
+(define_insn
+  "@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB2:mode><SI_ONLY:mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	 [(match_operand:VDQSF 1 "register_operand" "w")
+	  (match_operand:VB 2 "register_operand" "w")
+	  (match_operand:VB2 3 "register_operand" "w")
+	  (match_operand:SI_ONLY 4 "const_int_operand" "n")
+	  (reg:DI FPM_REGNUM)]
+	FPM_VDOT4_UNS))]
+  "TARGET_FP8DOT4"
+  "<fpm_uns_op>\t%1.<VDQSF:Vtype>, %2.<VB:Vtype>, %3.<VDQSF:Vdotlanetype>[%4]"
+)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index f07b2c4..c50a578 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -494,6 +494,12 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
   ((TARGET_SVE2p1 || TARGET_STREAMING) \
    && (TARGET_SME2 || TARGET_NON_STREAMING))
 
+/* fp8 dot product instructions are enabled through +fp8dot2.  */
+#define TARGET_FP8DOT2 AARCH64_HAVE_ISA (FP8DOT2)
+
+/* fp8 dot product instructions are enabled through +fp8dot4.  */
+#define TARGET_FP8DOT4 AARCH64_HAVE_ISA (FP8DOT4)
+
 /* Standard register usage.  */
 
 /* 31 64-bit general purpose registers R0-R30:
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index bdd276b..8c03dcd 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -163,6 +163,10 @@
 
 ;; Advanced SIMD Float modes.
 (define_mode_iterator VDQF [V2SF V4SF V2DF])
+
+(define_mode_iterator VHF [(V4HF "TARGET_SIMD_F16INST")
+			   (V8HF "TARGET_SIMD_F16INST")])
+
 (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
 			     (V8HF "TARGET_SIMD_F16INST")
 			     V2SF V4SF V2DF])
@@ -321,6 +325,7 @@
 
 ;; All byte modes.
 (define_mode_iterator VB [V8QI V16QI])
+(define_mode_iterator VB2 [VB])
 
 ;; 1 and 2 lane DI and DF modes.
 (define_mode_iterator V12DIF [V1DI V1DF V2DI V2DF])
@@ -764,6 +769,8 @@
     UNSPEC_VCVT2	; Used in aarch64-simd.md.
     UNSPEC_VCVT2_HIGH	; Used in aarch64-simd.md.
     UNSPEC_VCVT2_LOW	; Used in aarch64-simd.md.
+    UNSPEC_VDOT2 	; Used in aarch64-simd.md.
+    UNSPEC_VDOT4	; Used in aarch64-simd.md.
     UNSPEC_TBL		; Used in vector permute patterns.
     UNSPEC_TBLQ		; Used in vector permute patterns.
     UNSPEC_TBX		; Used in vector permute patterns.
@@ -2491,6 +2498,11 @@
 			    (VNx8HF ".h") (VNx16HF "") (VNx32HF "")
 			    (VNx8HI ".h") (VNx16HI "") (VNx32HI "")])
 
+
+;; Lane index suffix for fp8 vdot operations depends on the output mode
+(define_mode_attr Vdotlanetype [(V4HF "2b") (V8HF "2b")
+				(V2SF "4b") (V4SF "4b")])
+
 ;; The number of bytes controlled by a predicate
 (define_mode_attr data_bytes [(VNx16BI "1") (VNx8BI "2")
 			      (VNx4BI "4") (VNx2BI "8")])
@@ -4720,7 +4732,12 @@
    (UNSPEC_VCVT2_HIGH "f2cvtl2")
    (UNSPEC_VCVT2_LOW "f2cvtl")])
 
+(define_int_iterator FPM_VDOT2_UNS [UNSPEC_VDOT2])
+(define_int_iterator FPM_VDOT4_UNS [UNSPEC_VDOT4])
+
 (define_int_attr fpm_uns_op
   [(UNSPEC_FSCALE "fscale")
    (UNSPEC_VCVT "fcvtn")
-   (UNSPEC_VCVT_HIGH "fcvtn2")])
+   (UNSPEC_VCVT_HIGH "fcvtn2")
+   (UNSPEC_VDOT2 "fdot")
+   (UNSPEC_VDOT4 "fdot")])
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 4a494f6..bc3f742 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -21807,6 +21807,10 @@ Enable support for Armv8.9-a/9.4-a translation hardening extension.
 Enable the RCpc3 (Release Consistency) extension.
 @item fp8
 Enable the fp8 (8-bit floating point) extension.
+@item fp8dot2
+Enable the fp8dot2 (8-bit floating point dot product) extension.
+@item fp8dot4
+Enable the fp8dot4 (8-bit floating point dot product) extension.
 @item faminmax
 Enable the Floating Point Absolute Maximum/Minimum extension.
 
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c
new file mode 100644
index 0000000..3e888a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vdot_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4h, v1.8b, v2.8b
+**	ret
+*/
+float16x4_t
+test_vdot_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_f16_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdotq_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.8h, v1.16b, v2.16b
+**	ret
+*/
+float16x8_t
+test_vdotq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdotq_f16_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdot_lane_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4h, v1.8b, v2.2b\[1\]
+**	ret
+*/
+float16x4_t
+test_vdot_lane_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_lane_f16_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdot_laneq_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4h, v1.8b, v2.2b\[1\]
+**	ret
+*/
+float16x4_t
+test_vdot_laneq_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdot_laneq_f16_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_lane_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.8h, v1.16b, v2.2b\[1\]
+**	ret
+*/
+float16x8_t
+test_vdotq_lane_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdotq_lane_f16_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_laneq_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.8h, v1.16b, v2.2b\[1\]
+**	ret
+*/
+float16x8_t
+test_vdotq_laneq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdotq_laneq_f16_mf8_fpm (a, b, c, 1, d);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c
new file mode 100644
index 0000000..f03dd0a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vdot_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.2s, v1.8b, v2.8b
+**	ret
+*/
+float32x2_t
+test_vdot_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_f32_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdotq_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4s, v1.16b, v2.16b
+**	ret
+*/
+float32x4_t
+test_vdotq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdotq_f32_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdot_lane_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.2s, v1.8b, v2.4b\[1\]
+**	ret
+*/
+float32x2_t
+test_vdot_lane_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_lane_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdot_laneq_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.2s, v1.8b, v2.4b\[1\]
+**	ret
+*/
+float32x2_t
+test_vdot_laneq_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdot_laneq_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_lane_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4s, v1.16b, v2.4b\[1\]
+**	ret
+*/
+float32x4_t
+test_vdotq_lane_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdotq_lane_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_laneq_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4s, v1.16b, v2.4b\[1\]
+**	ret
+*/
+float32x4_t
+test_vdotq_laneq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdotq_laneq_f32_mf8_fpm (a, b, c, 1, d);
+}