7 files changed, 30 insertions, 20 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d665f3d..36d3f50 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,14 @@
 2018-02-01  Richard Sandiford  <richard.sandiford@linaro.org>
 
+	* config/aarch64/aarch64-sve.md (sve_ld1rq): Replace with...
+	(*sve_ld1rq<Vesize>): ... this new pattern.  Handle all element sizes,
+	not just bytes.
+	* config/aarch64/aarch64.c (aarch64_expand_sve_widened_duplicate):
+	Remove BSWAP handling for big-endian targets and use the form of
+	LD1RQ appropriate for the mode.
+
+2018-02-01  Richard Sandiford  <richard.sandiford@linaro.org>
+
 	* config/aarch64/aarch64.c (aarch64_simd_valid_immediate): Handle
 	all CONST_VECTOR_DUPLICATE_P vectors, not just those with a single
 	duplicated element.
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index ee942df..068fd8c 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -652,14 +652,14 @@
 ;; Load 128 bits from memory and duplicate to fill a vector.  Since there
 ;; are so few operations on 128-bit "elements", we don't define a VNx1TI
 ;; and simply use vectors of bytes instead.
-(define_insn "sve_ld1rq"
-  [(set (match_operand:VNx16QI 0 "register_operand" "=w")
-	(unspec:VNx16QI
-	  [(match_operand:VNx16BI 1 "register_operand" "Upl")
+(define_insn "*sve_ld1rq<Vesize>"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl")
 	   (match_operand:TI 2 "aarch64_sve_ld1r_operand" "Uty")]
 	  UNSPEC_LD1RQ))]
   "TARGET_SVE"
-  "ld1rqb\t%0.b, %1/z, %2"
+  "ld1rq<Vesize>\t%0.<Vetype>, %1/z, %2"
 )
 
 ;; Implement a predicate broadcast by shifting the low bit of the scalar
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1278f83..ae142b4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2787,16 +2787,7 @@ aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
       return true;
     }
 
-  /* The bytes are loaded in little-endian order, so do a byteswap on
-     big-endian targets.  */
-  if (BYTES_BIG_ENDIAN)
-    {
-      src = simplify_unary_operation (BSWAP, src_mode, src, src_mode);
-      if (!src)
-	return NULL_RTX;
-    }
-
-  /* Use LD1RQ to load the 128 bits from memory.  */
+  /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
   src = force_const_mem (src_mode, src);
   if (!src)
     return false;
@@ -2808,8 +2799,12 @@ aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
       src = replace_equiv_address (src, addr);
     }
 
-  rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
-  emit_insn (gen_sve_ld1rq (gen_lowpart (VNx16QImode, dest), ptrue, src));
+  machine_mode mode = GET_MODE (dest);
+  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
+  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
+  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
+  src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
+  emit_insn (gen_rtx_SET (dest, src));
   return true;
 }
 
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 92d013e..f440d1e4 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2018-02-01  Richard Sandiford  <richard.sandiford@linaro.org>
+
+	* gcc.target/aarch64/sve/slp_2.c: Expect LD1RQD rather than LD1RQB.
+	* gcc.target/aarch64/sve/slp_3.c: Expect LD1RQW rather than LD1RQB.
+	* gcc.target/aarch64/sve/slp_4.c: Expect LD1RQH rather than LD1RQB.
+
 2018-02-01  Jakub Jelinek  <jakub@redhat.com>
 
 	PR tree-optimization/81661
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c
index 4a219f2..657abb0 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c
@@ -32,7 +32,7 @@ TEST_ALL (VEC_PERM)
 /* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, } 2 } } */
 /* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 } } */
 /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 3 } } */
 /* { dg-final { scan-assembler-not {\tzip1\t} } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
index cfe20a8..dd47502 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
@@ -36,7 +36,7 @@ TEST_ALL (VEC_PERM)
 /* 1 for each 16-bit type and 4 for double.  */
 /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 7 } } */
 /* 1 for each 32-bit type.  */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
index 98ff68f..026fa8c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
@@ -38,7 +38,7 @@ TEST_ALL (VEC_PERM)
 /* 1 for each 8-bit type, 4 for each 32-bit type and 8 for double.  */
 /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 22 } } */
 /* 1 for each 16-bit type.  */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #11\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #17\n} 2 } } */