author	Tamar Christina <tamar.christina@arm.com>	2021-11-01 13:49:46 +0000
committer	Tamar Christina <tamar.christina@arm.com>	2021-11-01 13:49:46 +0000
commit	1d5c43db79b7ea14f0dc071beb40cf80de90eb86 (patch)
tree	7ff0042d0c3e7465e31e34ee3c79575a845d1298
parent	68b48f3f4c49132cc6bfb16e65f8b6fd939689c7 (diff)
AArch64: Add better costing for vector constants and operations
This patch adds extended costing to cost the creation of constants and the
manipulation of constants.  The default values provided are based on
architectural expectations and each cost model can be individually tweaked as
needed.

The changes in this patch cover:

* Construction of PARALLEL or CONST_VECTOR:
  Adds better costing for vectors of constants, based on the constant being
  created and the instruction that can be used to create it, i.e. a movi is
  cheaper than a literal load etc.
* Construction of a vector through a vec_dup.

gcc/ChangeLog:

	* config/arm/aarch-common-protos.h (struct vector_cost_table): Add
	movi, dup and extract costing fields.
	* config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
	thunderx_extra_costs, thunderx2t99_extra_costs,
	thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs):
	Use them.
	* config/arm/aarch-cost-tables.h (generic_extra_costs,
	cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
	exynosm1_extra_costs, xgene1_extra_costs): Likewise.
	* config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Add r->w dup.
	* config/aarch64/aarch64.c (aarch64_rtx_costs): Add extra costs.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vect-cse-codegen.c: New test.
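
For context (this illustration is not part of the commit): the distinction the
new vect.movi and ldst.load costs draw can be seen on a small intrinsics
example.  A constant whose lanes hold a value MOVI/MVNI can encode costs
roughly one vector instruction, while an arbitrary 128-bit pattern has to come
from the literal pool (or a mov/movk plus fmov/dup sequence), so costing every
CONST_VECTOR the same hides real differences from CSE and the RTL optimizers.
The constant below is borrowed from the new test; the function names are
purely illustrative.

#include <arm_neon.h>

/* Sketch only, not from the patch: the first constant is encodable by
   MOVI, the second is not.  */

uint32x4_t
cheap_constant (uint32x4_t a)
{
  /* {1, 1, 1, 1} is a valid SIMD immediate, so under the patch the
     CONST_VECTOR is costed as extra_cost->vect.movi.  */
  return vaddq_u32 (a, vdupq_n_u32 (1));
}

uint64x2_t
expensive_constant (uint64x2_t a)
{
  /* An arbitrary 64-bit pattern is rejected by
     aarch64_simd_valid_immediate, so the constant is costed as
     extra_cost->ldst.load (a literal-pool load).  */
  return vaddq_u64 (a, vdupq_n_u64 (0x0942430810234076UL));
}
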
-rw-r--r--	gcc/config/aarch64/aarch64-cost-tables.h		30
-rw-r--r--	gcc/config/aarch64/aarch64-simd.md			10
-rw-r--r--	gcc/config/aarch64/aarch64.c				38
-rw-r--r--	gcc/config/arm/aarch-common-protos.h			 3
-rw-r--r--	gcc/config/arm/aarch-cost-tables.h			30
-rw-r--r--	gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c	97
6 files changed, 188 insertions, 20 deletions
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7..bb499a1 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* Mult. */
+ COSTS_N_INSNS (4), /* Mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* Mult. */
+ COSTS_N_INSNS (4), /* Mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 29f3817..61c3d7e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@
)
(define_insn "aarch64_simd_dup<mode>"
- [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+ [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
(vec_duplicate:VDQF_F16
- (match_operand:<VEL> 1 "register_operand" "w")))]
+ (match_operand:<VEL> 1 "register_operand" "w,r")))]
"TARGET_SIMD"
- "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
- [(set_attr "type" "neon_dup<q>")]
+ "@
+ dup\\t%0.<Vtype>, %1.<Vetype>[0]
+ dup\\t%0.<Vtype>, %<vw>1"
+ [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
)
(define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 8448e56..fd9249c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12701,7 +12701,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
rtx op0, op1, op2;
const struct cpu_cost_table *extra_cost
= aarch64_tune_params.insn_extra_cost;
- int code = GET_CODE (x);
+ rtx_code code = GET_CODE (x);
scalar_int_mode int_mode;
/* By default, assume that everything has equivalent cost to the
@@ -13462,8 +13462,7 @@ cost_plus:
we must cost the explicit register move. */
if (mode == DImode
- && GET_MODE (op0) == SImode
- && outer == SET)
+ && GET_MODE (op0) == SImode)
{
int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
@@ -14002,8 +14001,39 @@ cost_plus:
mode, MULT, 1, speed);
return true;
}
+ break;
+ case CONST_VECTOR:
+ {
+ /* Load using MOVI/MVNI. */
+ if (aarch64_simd_valid_immediate (x, NULL))
+ *cost = extra_cost->vect.movi;
+ else /* Load using constant pool. */
+ *cost = extra_cost->ldst.load;
+ break;
+ }
+ case VEC_CONCAT:
+ /* depending on the operation, either DUP or INS.
+ For now, keep default costing. */
+ break;
+ case VEC_DUPLICATE:
+ /* Load using a DUP. */
+ *cost = extra_cost->vect.dup;
+ return false;
+ case VEC_SELECT:
+ {
+ rtx op0 = XEXP (x, 0);
+ *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
- /* Fall through. */
+ /* cost subreg of 0 as free, otherwise as DUP */
+ rtx op1 = XEXP (x, 1);
+ if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+ ;
+ else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+ *cost = extra_cost->vect.dup;
+ else
+ *cost = extra_cost->vect.extract;
+ return true;
+ }
default:
break;
}
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1..55a470d 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
{
const int alu;
const int mult;
+ const int movi;
+ const int dup;
+ const int extract;
};
struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702..0e6a626 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
/* Vector */
{
COSTS_N_INSNS (0), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
/* Vector */
{
COSTS_N_INSNS (2), /* alu. */
- COSTS_N_INSNS (8) /* mult. */
+ COSTS_N_INSNS (8), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000..d025e98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,97 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test1:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** str q[0-9]+, \[x[0-9]+\]
+** fmov x[0-9]+, d[0-9]+
+** orr x[0-9]+, x[0-9]+, x[0-9]+
+** ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+ uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+ uint64_t res = a | arr[0];
+ uint64x2_t val = vld1q_u64 (arr);
+ *rt = vaddq_u64 (val, b);
+ return res;
+}
+
+/*
+**test2:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** str q[0-9]+, \[x[0-9]+\]
+** fmov x[0-9]+, d[0-9]+
+** orr x[0-9]+, x[0-9]+, x[0-9]+
+** ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+ uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+ uint64_t arr = vgetq_lane_u64 (val, 0);
+ uint64_t res = a | arr;
+ *rt = vaddq_u64 (val, b);
+ return res;
+}
+
+/*
+**test3:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** str q[0-9]+, \[x1\]
+** fmov w[0-9]+, s[0-9]+
+** orr w[0-9]+, w[0-9]+, w[0-9]+
+** ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+ uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+ uint32_t res = a | arr[0];
+ uint32x4_t val = vld1q_u32 (arr);
+ *rt = vaddq_u32 (val, b);
+ return res;
+}
+
+/*
+**test4:
+** ushr v[0-9]+.16b, v[0-9]+.16b, 7
+** mov x[0-9]+, 16512
+** movk x[0-9]+, 0x1020, lsl 16
+** movk x[0-9]+, 0x408, lsl 32
+** movk x[0-9]+, 0x102, lsl 48
+** fmov d[0-9]+, x[0-9]+
+** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
+** dup v[0-9]+.2d, v[0-9]+.d\[0\]
+** pmull2 v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
+** trn2 v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** umov w[0-9]+, v[0-9]+.h\[3\]
+** ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+ uint8x16_t bool_input = vshrq_n_u8(input, 7);
+ poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+ poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+ vgetq_lane_p64(mask, 0));
+ poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+ uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+ return vget_lane_u16((uint16x4_t)res, 3);
+}
+