author     Jonathan Wright <jonathan.wright@arm.com>   2021-07-28 17:45:36 +0100
committer  Jonathan Wright <jonathan.wright@arm.com>   2021-08-05 11:52:13 +0100
commit     0c3aab7f2a394b69a0cfd4852e33f11d9eb7e737 (patch)
tree       14bc4fe5016ee5661f51deac402500bf5fc94b53
parent     8cd27a3b25558e5be7f8595fc1c828bc46641671 (diff)
aarch64: Don't include vec_select high-half in SIMD subtract cost
The Neon subtract-long/subtract-widen instructions can select the top or
bottom half of the operand registers. This selection does not change the
cost of the underlying instruction, and this should be reflected by the
RTL cost function.

This patch adds RTL tree traversal in the Neon subtract cost function to
match a vec_select of the high half of its operands. This traversal
prevents the cost of the vec_select from being added into the cost of the
subtract - meaning that these instructions can now be emitted in the
combine pass as they are no longer deemed prohibitively expensive.

gcc/ChangeLog:

2021-07-28  Jonathan Wright  <jonathan.wright@arm.com>

        * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
          of vec_select high-half from being added into Neon subtract
          cost.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/vsubX_high_cost.c: New test.
-rw-r--r--   gcc/config/aarch64/aarch64.c                         15
-rw-r--r--   gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c   38
2 files changed, 53 insertions(+), 0 deletions(-)
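For illustration only - the function below is not part of the patch and its
name is made up for this sketch - this is the shape of intrinsics code the
change is aimed at. Both operands of the widening subtract are the high
halves of 128-bit registers; once the vec_select is no longer charged to the
subtract, combine should be able to keep this as a single high-half
subtract-long (ssubl2) instead of materialising the high halves in separate
registers first.

#include <arm_neon.h>

/* Sketch only: with the patched cost model, the vget_high/vsubl pair
   should combine into one SUBL2-class instruction rather than a DUP
   of the high half followed by a SUBL.  */
int16x8_t
subl_high_example (int8x16_t a, int8x16_t b)
{
  return vsubl_s8 (vget_high_s8 (a), vget_high_s8 (b));
}

The new test below exercises the same idea across the SUBL and SUBW variants
and checks that no stray DUP instructions are emitted.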
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index aa687c5..30f8365 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13089,6 +13089,21 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
       op1 = XEXP (x, 1);
 
 cost_minus:
+      if (VECTOR_MODE_P (mode))
+        {
+          /* SUBL2 and SUBW2.  */
+          unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+          if (vec_flags & VEC_ADVSIMD)
+            {
+              /* The select-operand-high-half versions of the sub instruction
+                 have the same cost as the regular three vector version -
+                 don't add the costs of the select into the costs of the sub.
+                 */
+              op0 = aarch64_strip_extend_vec_half (op0);
+              op1 = aarch64_strip_extend_vec_half (op1);
+            }
+        }
+
       *cost += rtx_cost (op0, mode, MINUS, 0, speed);
 
       /* Detect valid immediates.  */
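aarch64_strip_extend_vec_half is not defined in this hunk; it is assumed to
come from the earlier patch in this series that applied the same treatment to
the Neon multiply cost (the parent commit above). A minimal sketch of the kind
of traversal it performs, with the high-half predicate name
(vec_series_highpart_p) also assumed rather than taken from this diff, would
be:

/* Sketch only, not part of this diff: look through a sign/zero extend
   for a vec_select of the high half of a vector register and return the
   underlying operand, so that neither the select nor the extend adds
   anything to the cost computed for the subtract.  */
static rtx
aarch64_strip_extend_vec_half (rtx x)
{
  if (GET_CODE (x) == SIGN_EXTEND || GET_CODE (x) == ZERO_EXTEND)
    {
      x = XEXP (x, 0);
      if (GET_CODE (x) == VEC_SELECT
          && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
                                    XEXP (x, 1)))
        x = XEXP (x, 0);
    }
  return x;
}

In cost_minus this means a (sign_extend (vec_select ...)) or
(zero_extend (vec_select ...)) operand is costed as the plain inner register,
so the high-half selection is effectively free, matching the SUBL2/SUBW2
hardware behaviour.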
diff --git a/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c
new file mode 100644
index 0000000..09bc7fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_SUBL(rettype, intype, ts, rs) \
+  rettype test_vsubl_ ## ts (intype a, intype b, intype c) \
+  { \
+    rettype t0 = vsubl_ ## ts (vget_high_ ## ts (a), \
+                               vget_high_ ## ts (c)); \
+    rettype t1 = vsubl_ ## ts (vget_high_ ## ts (b), \
+                               vget_high_ ## ts (c)); \
+    return vaddq ## _ ## rs (t0, t1); \
+  }
+
+TEST_SUBL (int16x8_t, int8x16_t, s8, s16)
+TEST_SUBL (uint16x8_t, uint8x16_t, u8, u16)
+TEST_SUBL (int32x4_t, int16x8_t, s16, s32)
+TEST_SUBL (uint32x4_t, uint16x8_t, u16, u32)
+TEST_SUBL (int64x2_t, int32x4_t, s32, s64)
+TEST_SUBL (uint64x2_t, uint32x4_t, u32, u64)
+
+#define TEST_SUBW(rettype, intype, intypel, ts, rs) \
+  rettype test_vsubw_ ## ts (intype a, intype b, intypel c) \
+  { \
+    rettype t0 = vsubw_ ## ts (a, vget_high_ ## ts (c)); \
+    rettype t1 = vsubw_ ## ts (b, vget_high_ ## ts (c)); \
+    return vaddq ## _ ## rs (t0, t1); \
+  }
+
+TEST_SUBW (int16x8_t, int16x8_t, int8x16_t, s8, s16)
+TEST_SUBW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16)
+TEST_SUBW (int32x4_t, int32x4_t, int16x8_t, s16, s32)
+TEST_SUBW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32)
+TEST_SUBW (int64x2_t, int64x2_t, int32x4_t, s32, s64)
+TEST_SUBW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */