aboutsummaryrefslogtreecommitdiff
path: root/gcc/config
diff options
context:
space:
mode:
authorClaudio Bantaloukas <claudio.bantaloukas@arm.com>2024-11-29 12:52:45 +0000
committerRichard Sandiford <richard.sandiford@arm.com>2024-11-29 12:52:45 +0000
commit441f8d637d77d4e666bb0424af2335b1c8780890 (patch)
tree57eebf7d08482f1197cbce687b6fef31d8cc8239 /gcc/config
parent538204079b2fc9145e0cae61aacda493e1037327 (diff)
downloadgcc-441f8d637d77d4e666bb0424af2335b1c8780890.zip
gcc-441f8d637d77d4e666bb0424af2335b1c8780890.tar.gz
gcc-441f8d637d77d4e666bb0424af2335b1c8780890.tar.bz2
aarch64: add SVE2 FP8DOT2 and FP8DOT4 intrinsics
This patch adds support for the following intrinsics: - svdot[_f32_mf8]_fpm - svdot_lane[_f32_mf8]_fpm - svdot[_f16_mf8]_fpm - svdot_lane[_f16_mf8]_fpm The first two are available under a combination of the FP8DOT4 and SVE2 features. Alternatively under the SSVE_FP8DOT4 feature under streaming mode. The final two are available under a combination of the FP8DOT2 and SVE2 features. Alternatively under the SSVE_FP8DOT2 feature under streaming mode. gcc/ * config/aarch64/aarch64-option-extensions.def (fp8dot4, ssve-fp8dot4): Add new extensions. (fp8dot2, ssve-fp8dot2): Likewise. * config/aarch64/aarch64-sve-builtins-base.cc (svdot_impl): Support fp8. (svdotprod_lane_impl): Likewise. (svdot_lane): Provide an unspec for fp8 types. * config/aarch64/aarch64-sve-builtins-shapes.cc (ternary_mfloat8_def): Add new class. (ternary_mfloat8): Add new shape. (ternary_mfloat8_lane_group_selection_def): Add new class. (ternary_mfloat8_lane_group_selection): Add new shape. * config/aarch64/aarch64-sve-builtins-shapes.h (ternary_mfloat8, ternary_mfloat8_lane_group_selection): Declare. * config/aarch64/aarch64-sve-builtins-sve2.def (svdot, svdot_lane): Add new DEF_SVE_FUNCTION_GS_FPM, twice to deal with the combination of features providing support for 32 and 16 bit floating point. * config/aarch64/aarch64-sve2.md (@aarch64_sve_dot<mode>): Add new. (@aarch64_sve_dot_lane<mode>): Likewise. * config/aarch64/aarch64.h (TARGET_FP8DOT4, TARGET_SSVE_FP8DOT4): Add new defines. (TARGET_FP8DOT2, TARGET_SSVE_FP8DOT2): Likewise. * config/aarch64/iterators.md (UNSPEC_DOT_FP8, UNSPEC_DOT_LANE_FP8): Add new unspecs. * doc/invoke.texi: Document fp8dot4, fp8dot2, ssve-fp8dot4, ssve-fp8dot2 extensions. gcc/testsuite/ * gcc.target/aarch64/sve/acle/general-c/ternary_mfloat8_1.c: Add new. * gcc.target/aarch64/sve/acle/general-c/ternary_mfloat8_lane_group_selection_1.c: Likewise. * gcc.target/aarch64/sve2/acle/asm/dot_lane_mf8.c: Likewise. * gcc.target/aarch64/sve2/acle/asm/dot_mf8.c: Likewise. 
* lib/target-supports.exp: Add dg-require-effective-target support for aarch64_asm_fp8dot2_ok, aarch64_asm_fp8dot4_ok, aarch64_asm_ssve-fp8dot2_ok and aarch64_asm_ssve-fp8dot4_ok.
Diffstat (limited to 'gcc/config')
-rw-r--r--gcc/config/aarch64/aarch64-option-extensions.def8
-rw-r--r--gcc/config/aarch64/aarch64-sve-builtins-base.cc56
-rw-r--r--gcc/config/aarch64/aarch64-sve-builtins-shapes.cc48
-rw-r--r--gcc/config/aarch64/aarch64-sve-builtins-shapes.h8
-rw-r--r--gcc/config/aarch64/aarch64-sve-builtins-sve2.def14
-rw-r--r--gcc/config/aarch64/aarch64-sve2.md41
-rw-r--r--gcc/config/aarch64/aarch64.h18
-rw-r--r--gcc/config/aarch64/iterators.md2
8 files changed, 172 insertions, 23 deletions
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index 002d5ab..90abb1c 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -251,6 +251,14 @@ AARCH64_OPT_EXTENSION("ssve-fp8fma", SSVE_FP8FMA, (SME2,FP8), (), (), "ssve-fp8f
AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax")
+AARCH64_OPT_EXTENSION("fp8dot4", FP8DOT4, (FP8FMA), (), (), "fp8dot4")
+
+AARCH64_OPT_EXTENSION("ssve-fp8dot4", SSVE_FP8DOT4, (SSVE_FP8FMA), (), (), "ssve-fp8dot4")
+
+AARCH64_OPT_EXTENSION("fp8dot2", FP8DOT2, (FP8DOT4), (), (), "fp8dot2")
+
+AARCH64_OPT_EXTENSION("ssve-fp8dot2", SSVE_FP8DOT2, (SSVE_FP8DOT4), (), (), "ssve-fp8dot2")
+
#undef AARCH64_OPT_FMV_EXTENSION
#undef AARCH64_OPT_EXTENSION
#undef AARCH64_FMV_FEATURE
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 95e66dc..b979419 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -838,21 +838,26 @@ public:
rtx
expand (function_expander &e) const override
{
- /* In the optab, the multiplication operands come before the accumulator
- operand. The optab is keyed off the multiplication mode. */
- e.rotate_inputs_left (0, 3);
insn_code icode;
- if (e.type_suffix_ids[1] == NUM_TYPE_SUFFIXES)
- icode = e.convert_optab_handler_for_sign (sdot_prod_optab,
- udot_prod_optab,
- 0, e.result_mode (),
- GET_MODE (e.args[0]));
+ if (e.fpm_mode == aarch64_sve::FPM_set)
+ icode = code_for_aarch64_sve_dot (e.result_mode ());
else
- icode = (e.type_suffix (0).float_p
- ? CODE_FOR_aarch64_sve_fdotvnx4sfvnx8hf
- : e.type_suffix (0).unsigned_p
- ? CODE_FOR_udot_prodvnx4sivnx8hi
- : CODE_FOR_sdot_prodvnx4sivnx8hi);
+ {
+ /* In the optab, the multiplication operands come before the accumulator
+ operand. The optab is keyed off the multiplication mode. */
+ e.rotate_inputs_left (0, 3);
+ if (e.type_suffix_ids[1] == NUM_TYPE_SUFFIXES)
+ icode = e.convert_optab_handler_for_sign (sdot_prod_optab,
+ udot_prod_optab,
+ 0, e.result_mode (),
+ GET_MODE (e.args[0]));
+ else
+ icode = (e.type_suffix (0).float_p
+ ? CODE_FOR_aarch64_sve_fdotvnx4sfvnx8hf
+ : e.type_suffix (0).unsigned_p
+ ? CODE_FOR_udot_prodvnx4sivnx8hi
+ : CODE_FOR_sdot_prodvnx4sivnx8hi);
+ }
return e.use_unpred_insn (icode);
}
};
@@ -865,17 +870,24 @@ public:
rtx
expand (function_expander &e) const override
{
+ insn_code icode;
machine_mode mode0 = GET_MODE (e.args[0]);
machine_mode mode1 = GET_MODE (e.args[1]);
- /* Use the same ordering as the dot_prod_optab, with the
- accumulator last. */
- e.rotate_inputs_left (0, 4);
- int unspec = unspec_for (e);
- insn_code icode;
- if (unspec == UNSPEC_FDOT)
- icode = CODE_FOR_aarch64_fdot_prod_lanevnx4sfvnx8hf;
+ if (e.fpm_mode == aarch64_sve::FPM_set)
+ {
+ icode = code_for_aarch64_sve_dot_lane (mode0);
+ }
else
- icode = code_for_aarch64_dot_prod_lane (unspec, mode0, mode1);
+ {
+ /* Use the same ordering as the dot_prod_optab, with the
+ accumulator last. */
+ e.rotate_inputs_left (0, 4);
+ int unspec = unspec_for (e);
+ if (unspec == UNSPEC_FDOT)
+ icode = CODE_FOR_aarch64_fdot_prod_lanevnx4sfvnx8hf;
+ else
+ icode = code_for_aarch64_dot_prod_lane (unspec, mode0, mode1);
+ }
return e.use_exact_insn (icode);
}
};
@@ -3255,7 +3267,7 @@ FUNCTION (svdiv, svdiv_impl,)
FUNCTION (svdivr, rtx_code_function_rotated, (DIV, UDIV, UNSPEC_COND_FDIV))
FUNCTION (svdot, svdot_impl,)
FUNCTION (svdot_lane, svdotprod_lane_impl, (UNSPEC_SDOT, UNSPEC_UDOT,
- UNSPEC_FDOT))
+ UNSPEC_FDOT, UNSPEC_DOT_LANE_FP8))
FUNCTION (svdup, svdup_impl,)
FUNCTION (svdup_lane, svdup_lane_impl,)
FUNCTION (svdupq, svdupq_impl,)
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
index 94f4da8..cf3ddab 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
@@ -4005,6 +4005,34 @@ struct ternary_bfloat_def
};
SHAPE (ternary_bfloat)
+/* sv<t0>_t svfoo[_t0](sv<t0>_t, svmfloat8_t, svmfloat8_t). */
+struct ternary_mfloat8_def
+ : public ternary_resize2_base<8, TYPE_mfloat, TYPE_mfloat>
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ gcc_assert (group.fpm_mode == FPM_set);
+ b.add_overloaded_functions (group, MODE_none);
+ build_all (b, "v0,v0,vM,vM", group, MODE_none);
+ }
+
+ tree
+ resolve (function_resolver &r) const override
+ {
+ type_suffix_index type;
+ if (!r.check_num_arguments (4)
+ || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES
+ || !r.require_vector_type (1, VECTOR_TYPE_svmfloat8_t)
+ || !r.require_vector_type (2, VECTOR_TYPE_svmfloat8_t)
+ || !r.require_scalar_type (3, "uint64_t"))
+ return error_mark_node;
+
+ return r.resolve_to (r.mode_suffix_id, type, TYPE_SUFFIX_mf8, GROUP_none);
+ }
+};
+SHAPE (ternary_mfloat8)
+
/* sv<t0>_t svfoo[_t0](sv<t0>_t, svbfloat16_t, svbfloat16_t, uint64_t)
where the final argument is an integer constant expression in the range
@@ -4057,6 +4085,26 @@ struct ternary_mfloat8_lane_def
};
SHAPE (ternary_mfloat8_lane)
+/* sv<t0>_t svfoo[_t0](sv<t0>_t, svmfloat8_t, svmfloat8_t, uint64_t)
+
+ where the final argument is an integer constant expression in the range
+ [0, 7] or [0, 3]. */
+struct ternary_mfloat8_lane_group_selection_def
+ : public ternary_mfloat8_lane_def
+{
+ bool
+ check (function_checker &c) const override
+ {
+ machine_mode mode = c.vector_mode (0);
+ if (mode == E_VNx8HFmode)
+ return c.require_immediate_lane_index (3, 2, 2);
+ else if (mode == E_VNx4SFmode)
+ return c.require_immediate_lane_index (3, 2, 4);
+ gcc_unreachable ();
+ }
+};
+SHAPE (ternary_mfloat8_lane_group_selection)
+
/* sv<t0>_t svfoo[_t0](sv<t0>_t, svbfloatt16_t, svbfloat16_t)
sv<t0>_t svfoo[_n_t0](sv<t0>_t, svbfloat16_t, bfloat16_t). */
struct ternary_bfloat_opt_n_def
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.h b/gcc/config/aarch64/aarch64-sve-builtins-shapes.h
index 1c8937a..c7e448c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.h
@@ -71,7 +71,11 @@ namespace aarch64_sve
scalar displacement".
- "_pred" indicates that the function takes an svbool_t argument
- that does not act as a governing predicate.. */
+ that does not act as a governing predicate..
+
+ - "_group_selection" indicates that the function takes an imm integer
+ argument that selects a specific group of elements that fit a 128 bit
+ vector. */
namespace shapes
{
extern const function_shape *const adr_index;
@@ -213,7 +217,9 @@ namespace aarch64_sve
extern const function_shape *const ternary_lane_rotate;
extern const function_shape *const ternary_long_lane;
extern const function_shape *const ternary_long_opt_n;
+ extern const function_shape *const ternary_mfloat8;
extern const function_shape *const ternary_mfloat8_lane;
+ extern const function_shape *const ternary_mfloat8_lane_group_selection;
extern const function_shape *const ternary_mfloat8_opt_n;
extern const function_shape *const ternary_opt_n;
extern const function_shape *const ternary_qq_or_011_lane;
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
index b489e8f..082dec1 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
@@ -396,3 +396,17 @@ DEF_SVE_FUNCTION_GS_FPM (svmlallbb_lane, ternary_mfloat8_lane, s_float_mf8, none
DEF_SVE_FUNCTION_GS_FPM (svmlallbt_lane, ternary_mfloat8_lane, s_float_mf8, none, none, set)
DEF_SVE_FUNCTION_GS_FPM (svmlalltb_lane, ternary_mfloat8_lane, s_float_mf8, none, none, set)
#undef REQUIRED_EXTENSIONS
+
+#define REQUIRED_EXTENSIONS \
+ streaming_compatible (AARCH64_FL_SVE2 | AARCH64_FL_FP8DOT4, \
+ AARCH64_FL_SSVE_FP8DOT4)
+DEF_SVE_FUNCTION_GS_FPM (svdot, ternary_mfloat8, s_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svdot_lane, ternary_mfloat8_lane_group_selection, s_float_mf8, none, none, set)
+#undef REQUIRED_EXTENSIONS
+
+#define REQUIRED_EXTENSIONS \
+ streaming_compatible (AARCH64_FL_SVE2 | AARCH64_FL_FP8DOT2, \
+ AARCH64_FL_SSVE_FP8DOT2)
+DEF_SVE_FUNCTION_GS_FPM (svdot, ternary_mfloat8, h_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svdot_lane, ternary_mfloat8_lane_group_selection, h_float_mf8, none, none, set)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 5498eac..219e9fc 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -68,6 +68,7 @@
;; ---- [INT] Shift-and-insert operations
;; ---- [INT] Sum of absolute differences
;; ---- [FP] Mfloat8 Multiply-and-accumulate operations
+;; ---- [FP] Mfloat8 dot products
;;
;; == Extending arithmetic
;; ---- [INT] Multi-register widening conversions
@@ -2074,6 +2075,46 @@
}
)
+;; -------------------------------------------------------------------------
+;; ---- [FP] Mfloat8 dot products
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - FDOT (4-way, vectors)
+;; - FDOT (4-way, indexed)
+;; - FDOT (2-way, vectors)
+;; - FDOT (2-way, indexed)
+;; -------------------------------------------------------------------------
+(define_insn "@aarch64_sve_dot<mode>"
+ [(set (match_operand:SVE_FULL_HSF 0 "register_operand")
+ (unspec:SVE_FULL_HSF
+ [(match_operand:SVE_FULL_HSF 1 "register_operand")
+ (match_operand:VNx16QI 2 "register_operand")
+ (match_operand:VNx16QI 3 "register_operand")
+ (reg:DI FPM_REGNUM)]
+ UNSPEC_DOT_FP8))]
+ "TARGET_SSVE_FP8DOT4 && !(<MODE>mode == VNx8HFmode && !TARGET_SSVE_FP8DOT2)"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ w , 0 , w , w ; * ] fdot\t%0.<Vetype>, %2.b, %3.b
+ [ ?&w , w , w , w ; yes ] movprfx\t%0, %1\;fdot\t%0.<Vetype>, %2.b, %3.b
+ }
+)
+
+(define_insn "@aarch64_sve_dot_lane<mode>"
+ [(set (match_operand:SVE_FULL_HSF 0 "register_operand")
+ (unspec:SVE_FULL_HSF
+ [(match_operand:SVE_FULL_HSF 1 "register_operand")
+ (match_operand:VNx16QI 2 "register_operand")
+ (match_operand:VNx16QI 3 "register_operand")
+ (match_operand:SI 4 "const_int_operand")
+ (reg:DI FPM_REGNUM)]
+ UNSPEC_DOT_LANE_FP8))]
+ "TARGET_SSVE_FP8DOT4 && !(<MODE>mode == VNx8HFmode && !TARGET_SSVE_FP8DOT2)"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ w , 0 , w , y ; * ] fdot\t%0.<Vetype>, %2.b, %3.b[%4]
+ [ ?&w , w , w , y ; yes ] movprfx\t%0, %1\;fdot\t%0.<Vetype>, %2.b, %3.b[%4]
+ }
+)
+
;; =========================================================================
;; == Extending arithmetic
;; =========================================================================
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 80a1fa4..53b4f88 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -527,6 +527,24 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
(((TARGET_SVE2 && TARGET_FP8FMA) || TARGET_STREAMING) \
&& (AARCH64_HAVE_ISA (SSVE_FP8FMA) || TARGET_NON_STREAMING))
+/* fp8 four way dot product enabled through +fp8dot4. */
+#define TARGET_FP8DOT4 AARCH64_HAVE_ISA (FP8DOT4)
+
+/* Streaming versions of fp8 four way dot product instructions are enabled
+through +ssve-fp8dot4. */
+#define TARGET_SSVE_FP8DOT4 ((\
+ (TARGET_SVE2 && TARGET_FP8DOT4) || TARGET_STREAMING) \
+ && (AARCH64_HAVE_ISA(SSVE_FP8DOT4) || TARGET_NON_STREAMING))
+
+/* fp8 two way dot product enabled through +fp8dot2. */
+#define TARGET_FP8DOT2 AARCH64_HAVE_ISA (FP8DOT2)
+
+/* Streaming versions of fp8 two way dot product instructions are enabled
+through +ssve-fp8dot2. */
+#define TARGET_SSVE_FP8DOT2 ((\
+ (TARGET_SVE2 && TARGET_FP8DOT2) || TARGET_STREAMING) \
+ && (AARCH64_HAVE_ISA(SSVE_FP8DOT2) || TARGET_NON_STREAMING))
+
/* Standard register usage. */
/* 31 64-bit general purpose registers R0-R30:
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 4b265a7..4786b02 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -962,6 +962,8 @@
UNSPEC_COND_FCVTX ; Used in aarch64-sve2.md.
UNSPEC_COND_FCVTXNT ; Used in aarch64-sve2.md.
UNSPEC_COND_FLOGB ; Used in aarch64-sve2.md.
+ UNSPEC_DOT_FP8 ; Used in aarch64-sve2.md.
+ UNSPEC_DOT_LANE_FP8 ; Used in aarch64-sve2.md.
UNSPEC_EORBT ; Used in aarch64-sve2.md.
UNSPEC_EORTB ; Used in aarch64-sve2.md.
UNSPEC_F1CVT ; Used in aarch64-sve2.md.