-rw-r--r--  gcc/config/aarch64/aarch64-builtins.cc        |  22
-rw-r--r--  gcc/config/aarch64/aarch64-protos.h           |   1
-rw-r--r--  gcc/config/aarch64/aarch64.cc                 |  70
-rw-r--r--  gcc/config/aarch64/aarch64.h                  |   5
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/pr109072_1.c | 281
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/pr109072_2.c |  60
6 files changed, 435 insertions, 4 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index eaefbf6..cc6b7c0 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -2994,6 +2994,19 @@ get_mem_type_for_load_store (unsigned int fcode)
     }
 }
 
+/* We've seen a vector load from address ADDR.  Record it in
+   vector_load_decls, if appropriate.  */
+static void
+aarch64_record_vector_load_arg (tree addr)
+{
+  tree decl = aarch64_vector_load_decl (addr);
+  if (!decl)
+    return;
+  if (!cfun->machine->vector_load_decls)
+    cfun->machine->vector_load_decls = hash_set<tree>::create_ggc (31);
+  cfun->machine->vector_load_decls->add (decl);
+}
+
 /* Try to fold STMT, given that it's a call to the built-in function with
    subcode FCODE.  Return the new statement on success and null on
    failure.  */
@@ -3051,6 +3064,11 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
     BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
     BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
     BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
+      /* Punt until after inlining, so that we stand more chance of
+	 recording something meaningful in vector_load_decls.  */
+      if (!cfun->after_inlining)
+	break;
+      aarch64_record_vector_load_arg (args[0]);
       if (!BYTES_BIG_ENDIAN)
 	{
 	  enum aarch64_simd_type mem_type
@@ -3069,6 +3087,8 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
 				       fold_build2 (MEM_REF,
 						    access_type,
 						    args[0], zero));
+	  gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+	  gimple_set_vdef (new_stmt, gimple_vdef (stmt));
 	}
       break;
 
@@ -3092,6 +3112,8 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
 	    = gimple_build_assign (fold_build2 (MEM_REF, access_type,
 						args[0], zero),
 				   args[1]);
+	  gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+	  gimple_set_vdef (new_stmt, gimple_vdef (stmt));
 	}
       break;
 
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f75eb89..63339fa 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -761,6 +761,7 @@ bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem (rtx *);
 bool aarch64_expand_setmem (rtx *);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 661fff65..cc119d0 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15661,6 +15661,33 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
 
 /* Vectorizer cost model target hooks.  */
 
+/* If a vld1 from address ADDR should be recorded in vector_load_decls,
+   return the decl that should be recorded.  Return null otherwise.  */
+tree
+aarch64_vector_load_decl (tree addr)
+{
+  if (TREE_CODE (addr) != ADDR_EXPR)
+    return NULL_TREE;
+  tree base = get_base_address (TREE_OPERAND (addr, 0));
+  if (TREE_CODE (base) != VAR_DECL)
+    return NULL_TREE;
+  return base;
+}
+
+/* Return true if STMT_INFO accesses a decl that is known to be the
+   argument to a vld1 in the same function.  */
+static bool
+aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
+{
+  if (!cfun->machine->vector_load_decls)
+    return false;
+  auto dr = STMT_VINFO_DATA_REF (stmt_info);
+  if (!dr)
+    return false;
+  tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
+  return decl && cfun->machine->vector_load_decls->contains (decl);
+}
+
 /* Information about how the CPU would issue the scalar, Advanced SIMD
    or SVE version of a vector loop, using the scheme defined by the
    aarch64_base_vec_issue_info hierarchy of structures.  */
@@ -15891,6 +15918,20 @@ private:
      supported by Advanced SIMD and SVE2.  */
   bool m_has_avg = false;
 
+  /* True if the vector body contains a store to a decl and if the
+     function is known to have a vld1 from the same decl.
+
+     In the Advanced SIMD ACLE, the recommended endian-agnostic way of
+     initializing a vector is:
+
+       float f[4] = { elts };
+       float32x4_t x = vld1q_f32(f);
+
+     We should strongly prefer vectorization of the initialization of f,
+     so that the store to f and the load back can be optimized away,
+     leaving a vectorization of { elts }.  */
+  bool m_stores_to_vector_load_decl = false;
+
   /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
      - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
        SIMD code.
@@ -16907,6 +16948,18 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 	    }
 	}
     }
+
+  /* If the statement stores to a decl that is known to be the argument
+     to a vld1 in the same function, ignore the store for costing purposes.
+     See the comment above m_stores_to_vector_load_decl for more details.  */
+  if (stmt_info
+      && (kind == vector_store || kind == unaligned_store)
+      && aarch64_accesses_vector_load_decl_p (stmt_info))
+    {
+      stmt_cost = 0;
+      m_stores_to_vector_load_decl = true;
+    }
+
   return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
 }
 
@@ -17196,12 +17249,21 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
 
   /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
      the scalar code in the event of a tie, since there is more chance
-     of scalar code being optimized with surrounding operations.  */
+     of scalar code being optimized with surrounding operations.
+
+     In addition, if the vector body is a simple store to a decl that
+     is elsewhere loaded using vld1, strongly prefer the vector form,
+     to the extent of giving the prologue a zero cost.  See the comment
+     above m_stores_to_vector_load_decl for details.  */
   if (!loop_vinfo
       && scalar_costs
-      && m_stp_sequence_cost != ~0U
-      && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
-    m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+      && m_stp_sequence_cost != ~0U)
+    {
+      if (m_stores_to_vector_load_decl)
+	m_costs[vect_prologue] = 0;
+      else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
+	m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+    }
 
   vector_costs::finish_cost (scalar_costs);
 }
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2403795..155cace 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -860,6 +860,7 @@ struct GTY (()) aarch64_frame
   bool is_scs_enabled;
 };
 
+#ifdef hash_set_h
 typedef struct GTY (()) machine_function
 {
   struct aarch64_frame frame;
@@ -868,8 +869,12 @@ typedef struct GTY (()) machine_function
   /* One entry for each general purpose register.  */
   rtx call_via[SP_REGNUM];
   bool label_is_assembled;
+  /* A set of all decls that have been passed to a vld1 intrinsic in the
+     current function.  This is used to help guide the vector cost model.  */
+  hash_set<tree> *vector_load_decls;
 } machine_function;
 #endif
+#endif
 
 /* Which ABI to use.  */
 enum aarch64_abi_type
diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
new file mode 100644
index 0000000..6c1d2b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
@@ -0,0 +1,281 @@
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target aarch64_little_endian } } } */
+
+#include <arm_neon.h>
+
+/*
+** s32x2_1:
+**	dup	v0\.2s, w0
+**	ret
+*/
+int32x2_t
+s32x2_1 (int32_t x)
+{
+  int32_t arr[] = { x, x };
+  return vld1_s32 (arr);
+}
+
+/*
+** s32x2_2:
+**	fmov	s0, w0
+**	ret
+*/
+int32x2_t
+s32x2_2 (int32_t x)
+{
+  int32_t arr[] = { x, 0 };
+  return vld1_s32 (arr);
+}
+
+/*
+** s32x2_3:
+**	fmov	s0, w0
+**	ins	v0\.s\[1\], w1
+**	ret
+*/
+int32x2_t
+s32x2_3 (int32_t x, int32_t y)
+{
+  int32_t arr[] = { x, y };
+  return vld1_s32 (arr);
+}
+
+/*
+** f32x2_1:
+**	dup	v0\.2s, v0.s\[0\]
+**	ret
+*/
+float32x2_t
+f32x2_1 (float32_t x)
+{
+  float32_t arr[] = { x, x };
+  return vld1_f32 (arr);
+}
+
+/*
+** f32x2_2:
+**	ins	v0\.s\[1\], v1.s\[0\]
+**	ret
+*/
+float32x2_t
+f32x2_2 (float32_t x, float32_t y)
+{
+  float32_t arr[] = { x, y };
+  return vld1_f32 (arr);
+}
+
+/*
+** s16x4_1:
+**	dup	v0\.4h, w0
+**	ret
+*/
+int16x4_t
+s16x4_1 (int16_t x)
+{
+  int16_t arr[] = { x, x, x, x };
+  return vld1_s16 (arr);
+}
+
+/*
+** s16x4_2:
+**	...
+**	fmov	[dsh]0, [wx][0-9]+
+**	ret
+*/
+int16x4_t
+s16x4_2 (int16_t x)
+{
+  int16_t arr[] = { x, 0, 0, 0 };
+  return vld1_s16 (arr);
+}
+
+/*
+** s16x4_3:
+**	dup	v0\.4h, w1
+**	ins	v0.h\[0\], w0
+**	ret
+*/
+int16x4_t
+s16x4_3 (int16_t x, int16_t y)
+{
+  int16_t arr[] = { x, y, y, y };
+  return vld1_s16 (arr);
+}
+
+/*
+** f16x4_1:
+**	dup	v0\.4h, v0.h\[0\]
+**	ret
+*/
+float16x4_t
+f16x4_1 (float16_t x)
+{
+  float16_t arr[] = { x, x, x, x };
+  return vld1_f16 (arr);
+}
+
+/*
+** s64x2_1:
+**	dup	v0\.2d, x0
+**	ret
+*/
+int64x2_t
+s64x2_1 (int64_t x)
+{
+  int64_t arr[] = { x, x };
+  return vld1q_s64 (arr);
+}
+
+/*
+** s64x2_2: { xfail *-*-* }
+**	fmov	d0, x0
+**	ret
+*/
+int64x2_t
+s64x2_2 (int64_t x)
+{
+  int64_t arr[] = { x, 0 };
+  return vld1q_s64 (arr);
+}
+
+/*
+** s64x2_3:
+**	fmov	d0, x0
+**	ins	v0\.d\[1\], x1
+**	ret
+*/
+int64x2_t
+s64x2_3 (int64_t x, int64_t y)
+{
+  int64_t arr[] = { x, y };
+  return vld1q_s64 (arr);
+}
+
+/*
+** f64x2_1:
+**	dup	v0\.2d, v0.d\[0\]
+**	ret
+*/
+float64x2_t
+f64x2_1 (float64_t x)
+{
+  float64_t arr[] = { x, x };
+  return vld1q_f64 (arr);
+}
+
+/*
+** f64x2_2:
+**	ins	v0\.d\[1\], v1.d\[0\]
+**	ret
+*/
+float64x2_t
+f64x2_2 (float64_t x, float64_t y)
+{
+  float64_t arr[] = { x, y };
+  return vld1q_f64 (arr);
+}
+
+/*
+** s32x4_1:
+**	dup	v0\.4s, w0
+**	ret
+*/
+int32x4_t
+s32x4_1 (int32_t x)
+{
+  int32_t arr[] = { x, x, x, x };
+  return vld1q_s32 (arr);
+}
+
+/*
+** s32x4_2: { xfail *-*-* }
+**	fmov	s0, w0
+**	ret
+*/
+int32x4_t
+s32x4_2 (int32_t x)
+{
+  int32_t arr[] = { x, 0, 0, 0 };
+  return vld1q_s32 (arr);
+}
+
+/*
+** s32x4_3:
+**	dup	v0\.4s, w1
+**	ins	v0.s\[0\], w0
+**	ret
+*/
+int32x4_t
+s32x4_3 (int32_t x, int32_t y)
+{
+  int32_t arr[] = { x, y, y, y };
+  return vld1q_s32 (arr);
+}
+
+/*
+** f32x4_1:
+**	dup	v0\.4s, v0.s\[0\]
+**	ret
+*/
+float32x4_t
+f32x4_1 (float32_t x)
+{
+  float32_t arr[] = { x, x, x, x };
+  return vld1q_f32 (arr);
+}
+
+void consume (float32x4_t, float32x4_t, float32x4_t, float32x4_t);
+
+/*
+** produce_1:
+** (
+**	dup	v0\.4s, v0\.s\[0\]
+**	dup	v1\.4s, v1\.s\[0\]
+**	dup	v2\.4s, v2\.s\[0\]
+**	dup	v3\.4s, v3\.s\[0\]
+** |
+**	dup	v3\.4s, v3\.s\[0\]
+**	dup	v2\.4s, v2\.s\[0\]
+**	dup	v1\.4s, v1\.s\[0\]
+**	dup	v0\.4s, v0\.s\[0\]
+** )
+**	b	consume
+*/
+void
+produce_1 (float32_t a, float32_t b, float32_t c, float32_t d)
+{
+  float arr[4][4] = {
+    { a, a, a, a },
+    { b, b, b, b },
+    { c, c, c, c },
+    { d, d, d, d }
+  };
+  consume (vld1q_f32 (arr[0]), vld1q_f32 (arr[1]),
+	   vld1q_f32 (arr[2]), vld1q_f32 (arr[3]));
+}
+
+/*
+** produce_2:
+** (
+**	dup	v0\.4s, v0\.s\[0\]
+**	dup	v1\.4s, v1\.s\[0\]
+**	dup	v2\.4s, v2\.s\[0\]
+**	dup	v3\.4s, v3\.s\[0\]
+** |
+**	dup	v3\.4s, v3\.s\[0\]
+**	dup	v2\.4s, v2\.s\[0\]
+**	dup	v1\.4s, v1\.s\[0\]
+**	dup	v0\.4s, v0\.s\[0\]
+** )
+**	b	consume
+*/
+void
+produce_2 (float32_t a, float32_t b, float32_t c, float32_t d)
+{
+  float arr0[] = { a, a, a, a };
+  float arr1[] = { b, b, b, b };
+  float arr2[] = { c, c, c, c };
+  float arr3[] = { d, d, d, d };
+  consume (vld1q_f32 (arr0), vld1q_f32 (arr1),
+	   vld1q_f32 (arr2), vld1q_f32 (arr3));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_2.c b/gcc/testsuite/gcc.target/aarch64/pr109072_2.c
new file mode 100644
index 0000000..d532f08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr109072_2.c
@@ -0,0 +1,60 @@
+/* { dg-options "-O" } */
+
+#pragma GCC target "arch=armv8.2-a+dotprod"
+
+#include <arm_neon.h>
+
+static inline uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+  return vaddvq_u32(a);
+}
+
+static inline unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int w, int h,
+                                           const uint8_t *second_pred) {
+
+
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      p0 = vld1q_u8(second_pred);
+      avg0 = vrhaddq_u8(r0, p0);
+      diff0 = vabdq_u8(s0, avg0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      p1 = vld1q_u8(second_pred + 16);
+      avg1 = vrhaddq_u8(r1, p1);
+      diff1 = vabdq_u8(s1, avg1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+      second_pred += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+                         second_pred);
+}
+
+uint32_t vpx_sad32x16_avg_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { return sad32xh_avg_neon(src, src_stride, ref, ref_stride, (16), second_pred); }
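
For reference, the idiom that the new cost-model heuristic protects is the one described in the comment above m_stores_to_vector_load_decl: build a small local array from scalars and load it back with a vld1 intrinsic. A minimal stand-alone sketch of that pattern follows (not part of the patch; the function name splat4_f32 is illustrative, and it mirrors the f32x4_1 test above):

#include <arm_neon.h>

/* Endian-agnostic splat of a scalar, written the way the ACLE comment in
   aarch64.cc recommends: store the elements to a stack array, then load
   the array back with vld1q_f32.  With the cost-model change above, the
   scalar stores and the reload are expected to be optimized away on
   little-endian targets, leaving the single "dup v0.4s, v0.s[0]" that the
   f32x4_1 test checks for.  */
float32x4_t
splat4_f32 (float32_t x)
{
  float32_t f[4] = { x, x, x, x };
  return vld1q_f32 (f);
}

One way to see the effect is to compile such a file at -O2 with an AArch64 compiler that includes this change and inspect the assembly (e.g. aarch64-linux-gnu-gcc -O2 -S; the toolchain name is an assumption), or to run the new tests directly, e.g. make check-gcc RUNTESTFLAGS="aarch64.exp=pr109072_*.c".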