[AArch64] Use all SVE LD1RQ variants

The fallback way of handling a repeated 128-bit constant vector for SVE is to force the 128 bits to the constant pool and use LD1RQ to load it. Previously the code always used the byte variant of LD1RQ (LD1RQB), with a preceding BSWAP for big-endian targets. However, that BSWAP doesn't handle all cases correctly. The simplest fix seemed to be to use the LD1RQ appropriate for the element size. This helps to fix some of the sve/slp_*.c tests for aarch64_be, although a later patch is needed as well. 2018-02-01 Richard Sandiford <richard.sandiford@linaro.org> gcc/ * config/aarch64/aarch64-sve.md (sve_ld1rq): Replace with... (*sve_ld1rq<Vesize>): ... this new pattern. Handle all element sizes, not just bytes. * config/aarch64/aarch64.c (aarch64_expand_sve_widened_duplicate): Remove BSWAP handing for big-endian targets and use the form of LD1RQ appropariate for the mode. gcc/testsuite/ * gcc.target/aarch64/sve/slp_2.c: Expect LD1RQD rather than LD1RQB. * gcc.target/aarch64/sve/slp_3.c: Expect LD1RQW rather than LD1RQB. * gcc.target/aarch64/sve/slp_4.c: Expect LD1RQH rather than LD1RQB. Reviewed-by: James Greenhalgh <james.greenhalgh@arm.com> From-SVN: r257287
author: Richard Sandiford <richard.sandiford@linaro.org> 2018-02-01 11:03:36 +0000
committer: Richard Sandiford <rsandifo@gcc.gnu.org> 2018-02-01 11:03:36 +0000
commit: 947b137212d16d432eec201fe7f800dfdb481203 (patch)
tree: 115e9e9973ba12da75e94cd4bf7582ef41c8db9a
parent: f9093f23517498f99768816fb11849b798db85e9 (diff)
download: gcc-947b137212d16d432eec201fe7f800dfdb481203.zip
gcc-947b137212d16d432eec201fe7f800dfdb481203.tar.gz
gcc-947b137212d16d432eec201fe7f800dfdb481203.tar.bz2
7 files changed, 30 insertions, 20 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d665f3d..36d3f50 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,14 @@
 2018-02-01  Richard Sandiford  <richard.sandiford@linaro.org>
 
+	* config/aarch64/aarch64-sve.md (sve_ld1rq): Replace with...
+	(*sve_ld1rq<Vesize>): ... this new pattern.  Handle all element sizes,
+	not just bytes.
+	* config/aarch64/aarch64.c (aarch64_expand_sve_widened_duplicate):
+	Remove BSWAP handing for big-endian targets and use the form of
+	LD1RQ appropariate for the mode.
+
+2018-02-01  Richard Sandiford  <richard.sandiford@linaro.org>
+
 	* config/aarch64/aarch64.c (aarch64_simd_valid_immediate): Handle
 	all CONST_VECTOR_DUPLICATE_P vectors, not just those with a single
 	duplicated element.
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index ee942df..068fd8c 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -652,14 +652,14 @@
 ;; Load 128 bits from memory and duplicate to fill a vector.  Since there
 ;; are so few operations on 128-bit "elements", we don't define a VNx1TI
 ;; and simply use vectors of bytes instead.
-(define_insn "sve_ld1rq"
-  [(set (match_operand:VNx16QI 0 "register_operand" "=w")
-	(unspec:VNx16QI
-	  [(match_operand:VNx16BI 1 "register_operand" "Upl")
+(define_insn "*sve_ld1rq<Vesize>"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl")
 	   (match_operand:TI 2 "aarch64_sve_ld1r_operand" "Uty")]
 	  UNSPEC_LD1RQ))]
   "TARGET_SVE"
-  "ld1rqb\t%0.b, %1/z, %2"
+  "ld1rq<Vesize>\t%0.<Vetype>, %1/z, %2"
 )
 
 ;; Implement a predicate broadcast by shifting the low bit of the scalar
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1278f83..ae142b4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2787,16 +2787,7 @@ aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
       return true;
     }
 
-  /* The bytes are loaded in little-endian order, so do a byteswap on
-     big-endian targets.  */
-  if (BYTES_BIG_ENDIAN)
-    {
-      src = simplify_unary_operation (BSWAP, src_mode, src, src_mode);
-      if (!src)
-	return NULL_RTX;
-    }
-
-  /* Use LD1RQ to load the 128 bits from memory.  */
+  /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
   src = force_const_mem (src_mode, src);
   if (!src)
     return false;
@@ -2808,8 +2799,12 @@ aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
       src = replace_equiv_address (src, addr);
     }
 
-  rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
-  emit_insn (gen_sve_ld1rq (gen_lowpart (VNx16QImode, dest), ptrue, src));
+  machine_mode mode = GET_MODE (dest);
+  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
+  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
+  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
+  src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
+  emit_insn (gen_rtx_SET (dest, src));
   return true;
 }
 
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 92d013e..f440d1e4 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2018-02-01  Richard Sandiford  <richard.sandiford@linaro.org>
+
+	* gcc.target/aarch64/sve/slp_2.c: Expect LD1RQD rather than LD1RQB.
+	* gcc.target/aarch64/sve/slp_3.c: Expect LD1RQW rather than LD1RQB.
+	* gcc.target/aarch64/sve/slp_4.c: Expect LD1RQH rather than LD1RQB.
+
 2018-02-01  Jakub Jelinek  <jakub@redhat.com>
 
 	PR tree-optimization/81661
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c
index 4a219f2..657abb0 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c
@@ -32,7 +32,7 @@ TEST_ALL (VEC_PERM)
 /* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, } 2 } } */
 /* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 } } */
 /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 3 } } */
 /* { dg-final { scan-assembler-not {\tzip1\t} } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
index cfe20a8..dd47502 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
@@ -36,7 +36,7 @@ TEST_ALL (VEC_PERM)
 /* 1 for each 16-bit type and 4 for double.  */
 /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 7 } } */
 /* 1 for each 32-bit type.  */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
index 98ff68f..026fa8c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
@@ -38,7 +38,7 @@ TEST_ALL (VEC_PERM)
 /* 1 for each 8-bit type, 4 for each 32-bit type and 8 for double.  */
 /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 22 } } */
 /* 1 for each 16-bit type.  */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]\.h, } 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #11\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #17\n} 2 } } */
author	Richard Sandiford <richard.sandiford@linaro.org>	2018-02-01 11:03:36 +0000
committer	Richard Sandiford <rsandifo@gcc.gnu.org>	2018-02-01 11:03:36 +0000
commit	947b137212d16d432eec201fe7f800dfdb481203 (patch)
tree	115e9e9973ba12da75e94cd4bf7582ef41c8db9a
parent	f9093f23517498f99768816fb11849b798db85e9 (diff)
download	gcc-947b137212d16d432eec201fe7f800dfdb481203.zip gcc-947b137212d16d432eec201fe7f800dfdb481203.tar.gz gcc-947b137212d16d432eec201fe7f800dfdb481203.tar.bz2