author    Kyrylo Tkachov <kyrylo.tkachov@arm.com>  2023-04-24 09:44:55 +0100
committer Kyrylo Tkachov <kyrylo.tkachov@arm.com>  2023-04-24 09:44:55 +0100
commit    636e2273aec555faa0a2f0e0b97b5f3355b25e47 (patch)
tree      bb830b6c6b1ddf0842046c2c89ed970bb6cbe9a2 /gcc
parent    e0472ed5aeeb908cb34be57a74c520c90bcb79d8 (diff)
[4/4] aarch64: Convert UABAL2 and SABAL2 patterns to standard RTL codes
The final patch in the series tackles the most complex of this family of
patterns: UABAL2 and SABAL2.  These extract the high parts of the sources,
perform an absolute difference on them, widen the result and accumulate.

The motivating testcase for this patch (series) is included.  The
simplification required doesn't actually trigger with just the RTL pattern
change, because rtx costs block it.  So this patch also extends rtx costs
to recognise the (minus (smax x y) (smin x y)) expression we use to
describe absolute difference in the backend, and avoids recursing into its
arms.  This allows us to generate the expected single-instruction sequence
here.

Bootstrapped and tested on aarch64-none-linux-gnu.

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md (aarch64_<sur>abal2<mode>):
        Rename to...
        (aarch64_<su>abal2<mode>_insn): ... This.  Use RTL codes instead
        of unspec.
        (aarch64_<su>abal2<mode>): New define_expand.
        * config/aarch64/aarch64.cc (aarch64_abd_rtx_p): New function.
        (aarch64_rtx_costs): Handle ABD rtxes.
        * config/aarch64/aarch64.md (UNSPEC_SABAL2, UNSPEC_UABAL2):
        Delete.
        * config/aarch64/iterators.md (ABAL2): Delete.
        (sur): Remove handling of UNSPEC_UABAL2 and UNSPEC_SABAL2.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/simd/vabal_combine.c: New test.
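For reference, the combination this patch enables is the one exercised by
the new test: a vabal intrinsic fed by vget_high on both sources.  A
minimal C example (function and parameter names here are illustrative,
not from the patch):

    #include <arm_neon.h>

    /* With this patch, at -O this compiles to a single
       "sabal2 v0.8h, v2.16b, v1.16b" plus ret, as the new test asserts,
       rather than extracting the high halves and using SABAL.  */
    int16x8_t
    widening_abd_acc_hi (int16x8_t acc, int8x16_t a, int8x16_t b)
    {
      return vabal_s8 (acc, vget_high_s8 (a), vget_high_s8 (b));
    }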
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/aarch64/aarch64-simd.md                    | 43
-rw-r--r--  gcc/config/aarch64/aarch64.cc                         | 38
-rw-r--r--  gcc/config/aarch64/aarch64.md                         |  2
-rw-r--r--  gcc/config/aarch64/iterators.md                       |  4
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c | 72
5 files changed, 144 insertions(+), 15 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 51bb6cf..e420f58 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -969,17 +969,46 @@
[(set_attr "type" "neon_arith_acc<q>")]
)
-(define_insn "aarch64_<sur>abal2<mode>"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (unspec:<VWIDE> [(match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "register_operand" "w")
- (match_operand:<VWIDE> 1 "register_operand" "0")]
- ABAL2))]
+(define_insn "aarch64_<su>abal2<mode>_insn"
+ [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
+ (plus:<VDBLW>
+ (zero_extend:<VDBLW>
+ (minus:<VHALF>
+ (USMAX:<VHALF>
+ (vec_select:<VHALF>
+ (match_operand:VQW 2 "register_operand" "w")
+ (match_operand:VQW 4 "vect_par_cnst_hi_half" ""))
+ (vec_select:<VHALF>
+ (match_operand:VQW 3 "register_operand" "w")
+ (match_dup 4)))
+ (<max_opp>:<VHALF>
+ (vec_select:<VHALF>
+ (match_dup 2)
+ (match_dup 4))
+ (vec_select:<VHALF>
+ (match_dup 3)
+ (match_dup 4)))))
+ (match_operand:<VDBLW> 1 "register_operand" "0")))]
"TARGET_SIMD"
- "<sur>abal2\t%0.<Vwtype>, %2.<Vtype>, %3.<Vtype>"
+ "<su>abal2\t%0.<Vwtype>, %2.<Vtype>, %3.<Vtype>"
[(set_attr "type" "neon_arith_acc<q>")]
)
+(define_expand "aarch64_<su>abal2<mode>"
+ [(match_operand:<VDBLW> 0 "register_operand")
+ (match_operand:<VDBLW> 1 "register_operand")
+ (USMAX:VQW
+ (match_operand:VQW 2 "register_operand")
+ (match_operand:VQW 3 "register_operand"))]
+ "TARGET_SIMD"
+ {
+ rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
+ emit_insn (gen_aarch64_<su>abal2<mode>_insn (operands[0], operands[1],
+ operands[2], operands[3], hi));
+ DONE;
+ }
+)
+
(define_insn "aarch64_<sur>adalp<mode>"
[(set (match_operand:<VDBLW> 0 "register_operand" "=w")
(unspec:<VDBLW> [(match_operand:VDQV_L 2 "register_operand" "w")
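Element-wise, the RTL in the new pattern above computes max minus min on
the high-half elements and zero-extends the difference before
accumulating; zero_extend is correct for the signed variant too, since an
absolute difference is never negative.  A scalar model of one lane,
assuming byte elements (a sketch, not GCC code):

    #include <stdint.h>

    /* One UABAL2 lane on byte elements.  The SABAL2 variant would use
       signed max/min (smax/smin) but the same zero-extending widening.  */
    static uint16_t
    uabal2_lane (uint16_t acc, uint8_t a, uint8_t b)
    {
      uint8_t mx = a > b ? a : b;          /* umax */
      uint8_t mn = a < b ? a : b;          /* umin */
      return acc + (uint16_t) (mx - mn);   /* zero_extend, then plus */
    }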
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d7e895f..2b0de7c 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -13822,6 +13822,31 @@ aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
return (t == (t & -t));
}
+/* Return true if X is an RTX representing an operation in the ABD family
+ of instructions. */
+
+static bool
+aarch64_abd_rtx_p (rtx x)
+{
+ if (GET_CODE (x) != MINUS)
+ return false;
+ rtx max_arm = XEXP (x, 0);
+ rtx min_arm = XEXP (x, 1);
+ if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
+ return false;
+ bool signed_p = GET_CODE (max_arm) == SMAX;
+ if (signed_p && GET_CODE (min_arm) != SMIN)
+ return false;
+ else if (!signed_p && GET_CODE (min_arm) != UMIN)
+ return false;
+
+ rtx maxop0 = XEXP (max_arm, 0);
+ rtx maxop1 = XEXP (max_arm, 1);
+ rtx minop0 = XEXP (min_arm, 0);
+ rtx minop1 = XEXP (min_arm, 1);
+ return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
+}
+
/* Calculate the cost of calculating X, storing it in *COST. Result
is true if the total cost of the operation has now been calculated. */
static bool
@@ -14218,11 +14243,20 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
cost_minus:
if (VECTOR_MODE_P (mode))
{
- /* SUBL2 and SUBW2. */
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
{
- /* The select-operand-high-half versions of the sub instruction
+ /* Recognise the SABD and UABD operation here.
+ Recursion from the PLUS case will catch the accumulating
+ forms. */
+ if (aarch64_abd_rtx_p (x))
+ {
+ if (speed)
+ *cost += extra_cost->vect.alu;
+ return true;
+ }
+ /* SUBL2 and SUBW2.
+ The select-operand-high-half versions of the sub instruction
have the same cost as the regular three vector version -
don't add the costs of the select into the costs of the sub.
*/
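For illustration, the shape aarch64_abd_rtx_p accepts can be built with
GCC's internal rtx constructors; this sketch assumes V8HImode and two
register rtxes x and y (names are illustrative):

    /* (minus (umax x y) (umin x y)) -- now costed as a single ABD.  */
    rtx abd = gen_rtx_MINUS (V8HImode,
                             gen_rtx_UMAX (V8HImode, x, y),
                             gen_rtx_UMIN (V8HImode, x, y));
    /* aarch64_abd_rtx_p (abd) returns true.  If the two arms named
       different operands, the rtx_equal_p checks would fail and the
       expression would be costed as separate max, min and sub.  */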
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 426eb85..3e18f04 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -204,7 +204,6 @@
UNSPEC_PRLG_STK
UNSPEC_REV
UNSPEC_RBIT
- UNSPEC_SABAL2
UNSPEC_SADALP
UNSPEC_SCVTF
UNSPEC_SETMEM
@@ -225,7 +224,6 @@
UNSPEC_TLSLE24
UNSPEC_TLSLE32
UNSPEC_TLSLE48
- UNSPEC_UABAL2
UNSPEC_UADALP
UNSPEC_UCVTF
UNSPEC_USHL_2S
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0195cdc..13a7e89 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2567,9 +2567,6 @@
;; Int Iterators.
;; -------------------------------------------------------------------
-;; The unspec codes for the SABAL2, UABAL2 AdvancedSIMD instructions.
-(define_int_iterator ABAL2 [UNSPEC_SABAL2 UNSPEC_UABAL2])
-
;; The unspec codes for the SADALP, UADALP AdvancedSIMD instructions.
(define_int_iterator ADALP [UNSPEC_SADALP UNSPEC_UADALP])
@@ -3351,7 +3348,6 @@
(UNSPEC_SRHADD "sr") (UNSPEC_URHADD "ur")
(UNSPEC_SHSUB "s") (UNSPEC_UHSUB "u")
(UNSPEC_ADDHN "") (UNSPEC_RADDHN "r")
- (UNSPEC_SABAL2 "s") (UNSPEC_UABAL2 "u")
(UNSPEC_SADALP "s") (UNSPEC_UADALP "u")
(UNSPEC_SUBHN "") (UNSPEC_RSUBHN "r")
(UNSPEC_USQADD "us") (UNSPEC_SUQADD "su")
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c b/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c
new file mode 100644
index 0000000..c51878a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** test_vabal_s8:
+** sabal2 v0.8h, v2.16b, v1.16b
+** ret
+*/
+int16x8_t
+test_vabal_s8 (int16x8_t sadv, int8x16_t pv, int8x16_t sv)
+{
+ return vabal_s8 (sadv, vget_high_s8 (pv), vget_high_s8 (sv));
+}
+
+/*
+** test_vabal_u8:
+** uabal2 v0.8h, v2.16b, v1.16b
+** ret
+*/
+uint16x8_t
+test_vabal_u8 (uint16x8_t sadv, uint8x16_t pv, uint8x16_t sv)
+{
+ return vabal_u8 (sadv, vget_high_u8 (pv), vget_high_u8 (sv));
+}
+
+/*
+** test_vabal_s16:
+** sabal2 v0.4s, v2.8h, v1.8h
+** ret
+*/
+int32x4_t
+test_vabal_s16 (int32x4_t sadv, int16x8_t pv, int16x8_t sv)
+{
+ return vabal_s16 (sadv, vget_high_s16 (pv), vget_high_s16 (sv));
+}
+
+/*
+** test_vabal_u16:
+** uabal2 v0.4s, v2.8h, v1.8h
+** ret
+*/
+uint32x4_t
+test_vabal_u16 (uint32x4_t sadv, uint16x8_t pv, uint16x8_t sv)
+{
+ return vabal_u16 (sadv, vget_high_u16 (pv), vget_high_u16 (sv));
+}
+
+/*
+** test_vabal_s32:
+** sabal2 v0.2d, v2.4s, v1.4s
+** ret
+*/
+int64x2_t
+test_vabal_s32 (int64x2_t sadv, int32x4_t pv, int32x4_t sv)
+{
+ return vabal_s32 (sadv, vget_high_s32 (pv), vget_high_s32 (sv));
+}
+
+/*
+** test_vabal_u32:
+** uabal2 v0.2d, v2.4s, v1.4s
+** ret
+*/
+uint64x2_t
+test_vabal_u32 (uint64x2_t sadv, uint32x4_t pv, uint32x4_t sv)
+{
+ return vabal_u32 (sadv, vget_high_u32 (pv), vget_high_u32 (sv));
+}
+