[AArch64] Unify vec_set patterns, support floating-point vector modes properly

We've a deficiency in our vec_set family of patterns. We don't support directly loading a vector lane using LD1 for V2DImode and all the vector floating-point modes. We do do it correctly for the other integer vector modes (V4SI, V8HI etc) though. The alternatives on the relative floating-point patterns only allow a register-to-register INS instruction. That means if we want to load a value into a vector lane we must first load it into a scalar register and then perform an INS, which is wasteful. There is also an explicit V2DI vec_set expander dangling around for no reason that I can see. It seems to do the exact same things as the other vec_set expanders. This patch removes that. It now unifies all vec_set expansions into a single "vec_set<mode>" define_expand using the catch-all VALL_F16 iterator. With this patch we avoid loading values into scalar registers and then doing an explicit INS on them to move them into the desired vector lanes. For example for: typedef float v4sf __attribute__ ((vector_size (16))); typedef long long v2di __attribute__ ((vector_size (16))); v2di foo_v2di (long long *a, long long *b) { v2di res = { *a, *b }; return res; } v4sf foo_v4sf (float *a, float *b, float *c, float *d) { v4sf res = { *a, *b, *c, *d }; return res; } we currently generate: foo_v2di: ldr d0, [x0] ldr x0, [x1] ins v0.d[1], x0 ret foo_v4sf: ldr s0, [x0] ldr s3, [x1] ldr s2, [x2] ldr s1, [x3] ins v0.s[1], v3.s[0] ins v0.s[2], v2.s[0] ins v0.s[3], v1.s[0] ret but with this patch we generate the much cleaner: foo_v2di: ldr d0, [x0] ld1 {v0.d}[1], [x1] ret foo_v4sf: ldr s0, [x0] ld1 {v0.s}[1], [x1] ld1 {v0.s}[2], [x2] ld1 {v0.s}[3], [x3] ret * config/aarch64/aarch64-simd.md (vec_set<mode>): Use VALL_F16 mode iterator. Delete separate integer-mode vec_set<mode> expander. (aarch64_simd_vec_setv2di): Delete. (vec_setv2di): Delete. (aarch64_simd_vec_set<mode>): Delete all other patterns with that name. Use VALL_F16 mode iterator. Add LD1 alternative and use vwcore for the "w, r" alternative. * gcc.target/aarch64/vect-init-ld1.c: New test. From-SVN: r260351
author: Kyrylo Tkachov <kyrylo.tkachov@arm.com> 2018-05-18 08:52:30 +0000
committer: Kyrylo Tkachov <ktkachov@gcc.gnu.org> 2018-05-18 08:52:30 +0000
commit: 8364e58b5aadf9cac76126ed9535ce0332ac81bb (patch)
tree: 504a57ed922771ceaf234ead46813afaf4087212
parent: eb63c01f65d475f7f05d1979f66c1c41faa61da9 (diff)
download: gcc-8364e58b5aadf9cac76126ed9535ce0332ac81bb.zip
gcc-8364e58b5aadf9cac76126ed9535ce0332ac81bb.tar.gz
gcc-8364e58b5aadf9cac76126ed9535ce0332ac81bb.tar.bz2
4 files changed, 92 insertions, 76 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 8215995..59f4508 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,13 @@
+2018-05-18  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+	* config/aarch64/aarch64-simd.md (vec_set<mode>): Use VALL_F16 mode
+	iterator.  Delete separate integer-mode vec_set<mode> expander.
+	(aarch64_simd_vec_setv2di): Delete.
+	(vec_setv2di): Delete.
+	(aarch64_simd_vec_set<mode>): Delete all other patterns with that name.
+	Use VALL_F16 mode iterator.  Add LD1 alternative and use vwcore for
+	the "w, r" alternative.
+
 2018-05-18  Martin Liska  <mliska@suse.cz>
 
 	* passes.def: Add pass_lower_switch and pass_lower_switch_O0.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9cfd4d3..2ebd256 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -694,11 +694,11 @@
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w,w,w")
-        (vec_merge:VDQ_BHSI
-	    (vec_duplicate:VDQ_BHSI
-		(match_operand:<VEL> 1 "aarch64_simd_general_operand" "r,w,Utv"))
-	    (match_operand:VDQ_BHSI 3 "register_operand" "0,0,0")
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
+	(vec_merge:VALL_F16
+	    (vec_duplicate:VALL_F16
+		(match_operand:<VEL> 1 "aarch64_simd_general_operand" "w,?r,Utv"))
+	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
 	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD"
   {
@@ -707,16 +707,16 @@
    switch (which_alternative)
      {
      case 0:
-	return "ins\\t%0.<Vetype>[%p2], %w1";
-     case 1:
 	return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
+     case 1:
+	return "ins\\t%0.<Vetype>[%p2], %<vwcore>1";
      case 2:
         return "ld1\\t{%0.<Vetype>}[%p2], %1";
      default:
 	gcc_unreachable ();
      }
   }
-  [(set_attr "type" "neon_from_gp<q>, neon_ins<q>, neon_load1_one_lane<q>")]
+  [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
 )
 
 (define_insn "*aarch64_simd_vec_copy_lane<mode>"
@@ -1030,19 +1030,6 @@
   }
 )
 
-(define_expand "vec_set<mode>"
-  [(match_operand:VDQ_BHSI 0 "register_operand")
-   (match_operand:<VEL> 1 "register_operand")
-   (match_operand:SI 2 "immediate_operand")]
-  "TARGET_SIMD"
-  {
-    HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]);
-    emit_insn (gen_aarch64_simd_vec_set<mode> (operands[0], operands[1],
-					    GEN_INT (elem), operands[0]));
-    DONE;
-  }
-)
-
 ;; For 64-bit modes we use ushl/r, as this does not require a SIMD zero.
 (define_insn "vec_shr_<mode>"
   [(set (match_operand:VD 0 "register_operand" "=w")
@@ -1059,62 +1046,8 @@
   [(set_attr "type" "neon_shift_imm")]
 )
 
-(define_insn "aarch64_simd_vec_setv2di"
-  [(set (match_operand:V2DI 0 "register_operand" "=w,w")
-        (vec_merge:V2DI
-	    (vec_duplicate:V2DI
-		(match_operand:DI 1 "register_operand" "r,w"))
-	    (match_operand:V2DI 3 "register_operand" "0,0")
-	    (match_operand:SI 2 "immediate_operand" "i,i")))]
-  "TARGET_SIMD"
-  {
-    int elt = ENDIAN_LANE_N (2, exact_log2 (INTVAL (operands[2])));
-    operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
-    switch (which_alternative)
-      {
-      case 0:
-	return "ins\\t%0.d[%p2], %1";
-      case 1:
-        return "ins\\t%0.d[%p2], %1.d[0]";
-      default:
-	gcc_unreachable ();
-      }
-  }
-  [(set_attr "type" "neon_from_gp, neon_ins_q")]
-)
-
-(define_expand "vec_setv2di"
-  [(match_operand:V2DI 0 "register_operand")
-   (match_operand:DI 1 "register_operand")
-   (match_operand:SI 2 "immediate_operand")]
-  "TARGET_SIMD"
-  {
-    HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]);
-    emit_insn (gen_aarch64_simd_vec_setv2di (operands[0], operands[1],
-					  GEN_INT (elem), operands[0]));
-    DONE;
-  }
-)
-
-(define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
-	(vec_merge:VDQF_F16
-	    (vec_duplicate:VDQF_F16
-		(match_operand:<VEL> 1 "register_operand" "w"))
-	    (match_operand:VDQF_F16 3 "register_operand" "0")
-	    (match_operand:SI 2 "immediate_operand" "i")))]
-  "TARGET_SIMD"
-  {
-    int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2])));
-
-    operands[2] = GEN_INT ((HOST_WIDE_INT)1 << elt);
-    return "ins\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
-  }
-  [(set_attr "type" "neon_ins<q>")]
-)
-
 (define_expand "vec_set<mode>"
-  [(match_operand:VDQF_F16 0 "register_operand" "+w")
+  [(match_operand:VALL_F16 0 "register_operand" "+w")
    (match_operand:<VEL> 1 "register_operand" "w")
    (match_operand:SI 2 "immediate_operand" "")]
   "TARGET_SIMD"
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 765e08c..840d9a0 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2018-05-18  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+	* gcc.target/aarch64/vect-init-ld1.c: New test.
+
 2018-05-18  Martin Liska  <mliska@suse.cz>
 
 	* gcc.dg/tree-ssa/vrp104.c: Adjust dump file that is scanned.
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-ld1.c b/gcc/testsuite/gcc.target/aarch64/vect-init-ld1.c
new file mode 100644
index 0000000..d7ff9af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-ld1.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef long long v2di __attribute__ ((vector_size (16)));
+
+typedef __fp16 v4hf __attribute__ ((vector_size (8)));
+typedef __fp16 v8hf __attribute__ ((vector_size (16)));
+typedef float v2sf __attribute__ ((vector_size (8)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+
+#define FUNC2(T, IT)    \
+T                       \
+foo_##T (IT *a, IT *b)  \
+{                       \
+  T res = { *a, *b };   \
+  return res;           \
+}
+
+FUNC2(v2di, long long)
+FUNC2(v2si, int)
+FUNC2(v2df, double)
+FUNC2(v2sf, float)
+
+#define FUNC4(T, IT)    \
+T                       \
+foo_##T (IT *a, IT *b, IT *c, IT *d)    \
+{                                       \
+  T res = { *a, *b, *c, *d };           \
+  return res;                           \
+}
+
+FUNC4(v4si, int)
+FUNC4(v4hi, short)
+FUNC4(v4sf, float)
+FUNC4(v4hf, __fp16)
+
+#define FUNC8(T, IT)    \
+T                       \
+foo_##T (IT *a, IT *b, IT *c, IT *d, IT *e, IT *f, IT *g, IT *h)        \
+{                                                                       \
+  T res = { *a, *b, *c, *d, *e, *f, *g, *h };                           \
+  return res;                                                           \
+}
+
+FUNC8(v8hi, short)
+FUNC8(v8qi, char)
+FUNC8(v8hf, __fp16)
+
+
+v16qi
+foo_v16qi (char *a, char *a1, char *a2, char *a3, char *a4, char *a5,
+     char *a6, char *a7, char *a8, char *a9, char *a10, char *a11, char *a12,
+     char *a13, char *a14, char *a15)
+{
+  v16qi res = { *a, *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8, *a9, *a10,
+               *a11, *a12, *a13, *a14, *a15 };
+  return res;
+}
+
+/* { dg-final { scan-assembler-not "ld1r\t" } } */
+/* { dg-final { scan-assembler-not "dup\t" } } */
+/* { dg-final { scan-assembler-not "ins\t" } } */
author	Kyrylo Tkachov <kyrylo.tkachov@arm.com>	2018-05-18 08:52:30 +0000
committer	Kyrylo Tkachov <ktkachov@gcc.gnu.org>	2018-05-18 08:52:30 +0000
commit	8364e58b5aadf9cac76126ed9535ce0332ac81bb (patch)
tree	504a57ed922771ceaf234ead46813afaf4087212
parent	eb63c01f65d475f7f05d1979f66c1c41faa61da9 (diff)
download	gcc-8364e58b5aadf9cac76126ed9535ce0332ac81bb.zip gcc-8364e58b5aadf9cac76126ed9535ce0332ac81bb.tar.gz gcc-8364e58b5aadf9cac76126ed9535ce0332ac81bb.tar.bz2