AArch64: Fold LD1/ST1 with ptrue to LDR/STR for 128-bit VLS

If -msve-vector-bits=128, SVE loads and stores (LD1 and ST1) with a ptrue predicate can be replaced by neon instructions (LDR and STR), thus avoiding the predicate altogether. This also enables formation of LDP/STP pairs. For example, the test cases svfloat64_t ptrue_load (float64_t *x) { svbool_t pg = svptrue_b64 (); return svld1_f64 (pg, x); } void ptrue_store (float64_t *x, svfloat64_t data) { svbool_t pg = svptrue_b64 (); return svst1_f64 (pg, x, data); } were previously compiled to (with -O2 -march=armv8.2-a+sve -msve-vector-bits=128): ptrue_load: ptrue p3.b, vl16 ld1d z0.d, p3/z, [x0] ret ptrue_store: ptrue p3.b, vl16 st1d z0.d, p3, [x0] ret Now the are compiled to: ptrue_load: ldr q0, [x0] ret ptrue_store: str q0, [x0] ret The implementation includes the if-statement if (known_eq (GET_MODE_SIZE (mode), 16) && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA) which checks for 128-bit VLS and excludes partial modes with a mode size < 128 (e.g. VNx2QI). The patch was bootstrapped and tested on aarch64-linux-gnu, no regression. OK for mainline? Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com> gcc/ * config/aarch64/aarch64.cc (aarch64_emit_sve_pred_move): Fold LD1/ST1 with ptrue to LDR/STR for 128-bit VLS. gcc/testsuite/ * gcc.target/aarch64/sve/ldst_ptrue_128_to_neon.c: New test. * gcc.target/aarch64/sve/cond_arith_6.c: Adjust expected outcome. * gcc.target/aarch64/sve/pcs/return_4_128.c: Likewise. * gcc.target/aarch64/sve/pcs/return_5_128.c: Likewise. * gcc.target/aarch64/sve/pcs/struct_3_128.c: Likewise.
author: Jennifer Schmitz <jschmitz@nvidia.com> 2025-02-13 04:34:30 -0800
committer: Jennifer Schmitz <jschmitz@nvidia.com> 2025-04-30 11:05:11 +0200
commit: 83bb288faa39a0bf5ce2d62e21a090a130d8dda4 (patch)
tree: dd7482b9b3986d6e280bedab72ec644e72eebf47
parent: cc8b8c0b69200ab816a2626e29d91ac995f7438f (diff)
download: gcc-83bb288faa39a0bf5ce2d62e21a090a130d8dda4.zip
gcc-83bb288faa39a0bf5ce2d62e21a090a130d8dda4.tar.gz
gcc-83bb288faa39a0bf5ce2d62e21a090a130d8dda4.tar.bz2
6 files changed, 118 insertions, 96 deletions
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f7bccf5..fff8d9d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6416,13 +6416,30 @@ aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
 void
 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
 {
-  expand_operand ops[3];
   machine_mode mode = GET_MODE (dest);
-  create_output_operand (&ops[0], dest, mode);
-  create_input_operand (&ops[1], pred, GET_MODE(pred));
-  create_input_operand (&ops[2], src, mode);
-  temporary_volatile_ok v (true);
-  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
+  if ((MEM_P (dest) || MEM_P (src))
+      && known_eq (GET_MODE_SIZE (mode), 16)
+      && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
+      && !BYTES_BIG_ENDIAN)
+    {
+      if (MEM_P (src))
+	{
+	  rtx tmp = force_reg (V16QImode, adjust_address (src, V16QImode, 0));
+	  emit_move_insn (dest, lowpart_subreg (mode, tmp, V16QImode));
+	}
+      else
+	emit_move_insn (adjust_address (dest, V16QImode, 0),
+			force_lowpart_subreg (V16QImode, src, mode));
+    }
+  else
+    {
+      expand_operand ops[3];
+      create_output_operand (&ops[0], dest, mode);
+      create_input_operand (&ops[1], pred, GET_MODE(pred));
+      create_input_operand (&ops[2], src, mode);
+      temporary_volatile_ok v (true);
+      expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
+    }
 }
 
 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c
index 4085ab1..d5a12f1 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c
@@ -8,7 +8,8 @@ f (float *x)
       x[i] -= 1.0f;
 }
 
-/* { dg-final { scan-assembler {\tld1w\tz} } } */
+/* { dg-final { scan-assembler {\tld1w\tz} { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler {\tldr\tq} { target aarch64_little_endian } } } */
 /* { dg-final { scan-assembler {\tfcmgt\tp} } } */
 /* { dg-final { scan-assembler {\tfsub\tz} } } */
 /* { dg-final { scan-assembler {\tst1w\tz} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_128_to_neon.c b/gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_128_to_neon.c
new file mode 100644
index 0000000..43d36e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_128_to_neon.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=128" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+#include <arm_sve.h>
+
+#define TEST(TYPE, TY, B)				\
+  sv##TYPE						\
+  ld1_##TY##B (TYPE *x)					\
+  {							\
+    svbool_t pg = svptrue_b##B ();			\
+    return svld1_##TY##B (pg, x);			\
+  }							\
+							\
+  void							\
+  st1_##TY##B (TYPE *x, sv##TYPE data)			\
+  {							\
+    svbool_t pg = svptrue_b##B ();			\
+    svst1_##TY##B (pg, x, data);			\
+  }							\
+							\
+  sv##TYPE						\
+  ld1_vol_##TY##B (volatile sv##TYPE *ptr)		\
+  {							\
+    return *ptr;					\
+  }							\
+							\
+  void							\
+  st1_vol_##TY##B (volatile sv##TYPE *ptr, sv##TYPE x)	\
+  {							\
+    *ptr = x;						\
+  }
+
+TEST (bfloat16_t, bf, 16)
+TEST (float16_t, f, 16)
+TEST (float32_t, f, 32)
+TEST (float64_t, f, 64)
+TEST (int8_t, s, 8)
+TEST (int16_t, s, 16)
+TEST (int32_t, s, 32)
+TEST (int64_t, s, 64)
+TEST (uint8_t, u, 8)
+TEST (uint16_t, u, 16)
+TEST (uint32_t, u, 32)
+TEST (uint64_t, u, 64)
+
+/* { dg-final { scan-assembler-times {\tldr\tq0, \[x0\]} 24 } } */
+/* { dg-final { scan-assembler-times {\tstr\tq0, \[x0\]} 24 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_128.c
index 87d528c..ac5f981 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_128.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_128.c
@@ -11,104 +11,91 @@
 
 /*
 ** callee_s8:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1b	z0\.b, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (s8, __SVInt8_t)
 
 /*
 ** callee_u8:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1b	z0\.b, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (u8, __SVUint8_t)
 
 /*
 ** callee_mf8:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1b	z0\.b, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (mf8, __SVMfloat8_t)
 
 /*
 ** callee_s16:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1h	z0\.h, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (s16, __SVInt16_t)
 
 /*
 ** callee_u16:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1h	z0\.h, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (u16, __SVUint16_t)
 
 /*
 ** callee_f16:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1h	z0\.h, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (f16, __SVFloat16_t)
 
 /*
 ** callee_bf16:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1h	z0\.h, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (bf16, __SVBfloat16_t)
 
 /*
 ** callee_s32:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1w	z0\.s, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (s32, __SVInt32_t)
 
 /*
 ** callee_u32:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1w	z0\.s, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (u32, __SVUint32_t)
 
 /*
 ** callee_f32:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1w	z0\.s, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (f32, __SVFloat32_t)
 
 /*
 ** callee_s64:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1d	z0\.d, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (s64, __SVInt64_t)
 
 /*
 ** callee_u64:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1d	z0\.d, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (u64, __SVUint64_t)
 
 /*
 ** callee_f64:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1d	z0\.d, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (f64, __SVFloat64_t)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_128.c
index 347a16c..2fab6fe 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_128.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_128.c
@@ -13,104 +13,91 @@
 
 /*
 ** callee_s8:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1b	z0\.b, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (s8, svint8_t)
 
 /*
 ** callee_u8:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1b	z0\.b, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (u8, svuint8_t)
 
 /*
 ** callee_mf8:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1b	z0\.b, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (mf8, svmfloat8_t)
 
 /*
 ** callee_s16:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1h	z0\.h, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (s16, svint16_t)
 
 /*
 ** callee_u16:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1h	z0\.h, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (u16, svuint16_t)
 
 /*
 ** callee_f16:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1h	z0\.h, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (f16, svfloat16_t)
 
 /*
 ** callee_bf16:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1h	z0\.h, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (bf16, svbfloat16_t)
 
 /*
 ** callee_s32:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1w	z0\.s, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (s32, svint32_t)
 
 /*
 ** callee_u32:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1w	z0\.s, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (u32, svuint32_t)
 
 /*
 ** callee_f32:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1w	z0\.s, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (f32, svfloat32_t)
 
 /*
 ** callee_s64:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1d	z0\.d, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (s64, svint64_t)
 
 /*
 ** callee_u64:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1d	z0\.d, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (u64, svuint64_t)
 
 /*
 ** callee_f64:
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1d	z0\.d, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 CALLEE (f64, svfloat64_t)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
index d99ce12..29bdaf3 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
@@ -473,17 +473,16 @@ SEL2 (struct, pst_uniform4)
 **	sub	sp, sp, #144
 **	add	(x[0-9]+), sp, #?31
 **	and	x7, \1, #?(?:-32|4294967264)
-**	ptrue	(p[0-7])\.b, vl16
-**	st1w	z0\.s, \2, \[x7\]
-**	add	(x[0-9]+), x7, #?32
+**	mov	(x[0-9]+), x7
+**	str	q0, \[\2\], 32
 ** (
-**	str	z1, \[\3\]
-**	str	z2, \[\3, #1, mul vl\]
+**	str	z1, \[\2\]
+**	str	z2, \[\2, #1, mul vl\]
 ** |
-**	stp	q1, q2, \[\3\]
+**	stp	q1, q2, \[\2\]
 ** )
-**	str	z3, \[\3, #2, mul vl\]
-**	st1w	z4\.s, \2, \[x7, #6, mul vl\]
+**	str	z3, \[\2, #2, mul vl\]
+**	str	q4, \[x7, 96\]
 **	add	sp, sp, #?144
 **	ret
 */
@@ -516,20 +515,12 @@ SEL2 (struct, pst_mixed1)
 ** test_pst_mixed1:
 **	sub	sp, sp, #176
 **	str	p0, \[sp\]
-**	ptrue	p0\.b, vl16
-**	st1h	z0\.h, p0, \[sp, #1, mul vl\]
-**	st1h	z1\.h, p0, \[sp, #2, mul vl\]
-**	st1w	z2\.s, p0, \[sp, #3, mul vl\]
-**	st1d	z3\.d, p0, \[sp, #4, mul vl\]
+**	stp	q0, q1, \[sp, 16\]
+**	stp	q2, q3, \[sp, 48\]
 **	str	p1, \[sp, #40, mul vl\]
 **	str	p2, \[sp, #41, mul vl\]
-**	st1b	z4\.b, p0, \[sp, #6, mul vl\]
-**	st1h	z5\.h, p0, \[sp, #7, mul vl\]
-**	...
-**	st1w	z6\.s, p0, [^\n]*
-**	...
-**	st1d	z7\.d, p0, [^\n]*
-**	...
+**	stp	q4, q5, \[sp, 96\]
+**	stp	q6, q7, \[sp, 128\]
 **	str	p3, \[sp, #80, mul vl\]
 **	mov	(x7, sp|w7, wsp)
 **	add	sp, sp, #?176
@@ -557,15 +548,11 @@ SEL2 (struct, pst_mixed2)
 ** test_pst_mixed2:
 **	sub	sp, sp, #128
 **	str	p0, \[sp\]
-**	ptrue	(p[03])\.b, vl16
-**	add	(x[0-9]+), sp, #?2
-**	st1b	z0\.b, \1, \[\2\]
+**	str	q0, \[sp, 2\]
 **	str	p1, \[sp, #9, mul vl\]
-**	add	(x[0-9]+), sp, #?20
-**	st1b	z1\.b, \1, \[\3\]
+**	str	q1, \[sp, 20\]
 **	str	p2, \[sp, #18, mul vl\]
-**	add	(x[0-9]+), sp, #?38
-**	st1b	z2\.b, \1, \[\4\]
+**	str	q2, \[sp, 38\]
 ** (
 **	str	z3, \[sp, #4, mul vl\]
 **	str	z4, \[sp, #5, mul vl\]
@@ -595,8 +582,7 @@ SEL2 (struct, pst_big1)
 
 /*
 ** test_pst_big1_a: { target lp64 }
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1b	z0\.b, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 /*
@@ -760,8 +746,7 @@ test_pst_big3_d (struct pst_big3 x)
 
 /*
 ** test_pst_big3_e: { target lp64 }
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1b	z0\.b, \1/z, \[x0, #1, mul vl\]
+**	ldr	q0, \[x0, 16\]
 **	ret
 */
 /*
@@ -780,8 +765,7 @@ test_pst_big3_e (struct pst_big3 x)
 
 /*
 ** test_pst_big3_f: { target lp64 }
-**	ptrue	(p[0-7])\.b, vl16
-**	ld1b	z0\.b, \1/z, \[x0, #5, mul vl\]
+**	ldr	q0, \[x0, 80\]
 **	ret
 */
 /*
@@ -1035,8 +1019,7 @@ SEL2 (struct, nonpst6)
 
 /*
 ** test_nonpst6: { target lp64 }
-**	ptrue	(p[0-3])\.b, vl16
-**	ld1d	z0\.d, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 /*
@@ -1063,8 +1046,7 @@ SEL2 (struct, nonpst7)
 
 /*
 ** test_nonpst7: { target lp64 }
-**	ptrue	(p[0-3])\.b, vl16
-**	ld1d	z0\.d, \1/z, \[x0\]
+**	ldr	q0, \[x0\]
 **	ret
 */
 /*
author	Jennifer Schmitz <jschmitz@nvidia.com>	2025-02-13 04:34:30 -0800
committer	Jennifer Schmitz <jschmitz@nvidia.com>	2025-04-30 11:05:11 +0200
commit	83bb288faa39a0bf5ce2d62e21a090a130d8dda4 (patch)
tree	dd7482b9b3986d6e280bedab72ec644e72eebf47
parent	cc8b8c0b69200ab816a2626e29d91ac995f7438f (diff)
download	gcc-83bb288faa39a0bf5ce2d62e21a090a130d8dda4.zip gcc-83bb288faa39a0bf5ce2d62e21a090a130d8dda4.tar.gz gcc-83bb288faa39a0bf5ce2d62e21a090a130d8dda4.tar.bz2