Diffstat (limited to 'gcc/testsuite/gcc.target')
-rw-r--r--  gcc/testsuite/gcc.target/arm/lob.h                      | 128
-rw-r--r--  gcc/testsuite/gcc.target/arm/lob1.c                     |  23
-rw-r--r--  gcc/testsuite/gcc.target/arm/lob6.c                     |   8
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c  | 146
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c  | 749
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c  |  46
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c    |  44
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c        |  31
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c    |  45
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c        |  31
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c    |  48
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c        |  28
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c    |  44
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c        |  32
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c    | 521
15 files changed, 1897 insertions, 27 deletions
diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h
index feaae7c..3941fe7 100644
--- a/gcc/testsuite/gcc.target/arm/lob.h
+++ b/gcc/testsuite/gcc.target/arm/lob.h
@@ -1,15 +1,131 @@
#include <string.h>
-
+#include <stdint.h>
/* Common code for lob tests. */
#define NO_LOB asm volatile ("@ clobber lr" : : : "lr" )
-#define N 10000
+#define N 100
+
+static void
+reset_data (int *a, int *b, int *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data64 (int64_t *a, int64_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+check_plus (int *a, int *b, int *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
static void
-reset_data (int *a, int *b, int *c)
+check_memcpy64 (int64_t *a, int64_t *c, int x)
{
- memset (a, -1, N * sizeof (*a));
- memset (b, -1, N * sizeof (*b));
- memset (c, -1, N * sizeof (*c));
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != a[i]) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
}
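
The new helpers take the number of processed elements x so that the run tests
can check both halves of a tail-predicated result: elements below x must hold
the expected sum, while elements at or above x must still hold the 0 written
by reset_data. For example, with the test function from dlstp-int8x16.c (an
illustrative sketch, not part of the patch):

  int8_t a[N], b[N], c[N];
  reset_data8 (a, b, c, N);   /* a, b filled with -1; c zeroed.  */
  test (a, b, c, 17);         /* Tail-predicated loop processes 17 elements.  */
  check_plus8 (a, b, c, 17);  /* c[0..16] == -2, c[17..N-1] == 0.  */
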
diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c
index ba5c82c..c8ce653 100644
--- a/gcc/testsuite/gcc.target/arm/lob1.c
+++ b/gcc/testsuite/gcc.target/arm/lob1.c
@@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c)
} while (i < N);
}
-void
-check (int *a, int *b, int *c)
-{
- for (int i = 0; i < N; i++)
- {
- NO_LOB;
- if (c[i] != a[i] + b[i])
- abort ();
- }
-}
-
int
main (void)
{
- reset_data (a, b, c);
+ reset_data (a, b, c, N);
loop1 (a, b ,c);
- check (a, b ,c);
- reset_data (a, b, c);
+ check_plus (a, b, c, N);
+ reset_data (a, b, c, N);
loop2 (a, b ,c);
- check (a, b ,c);
- reset_data (a, b, c);
+ check_plus (a, b, c, N);
+ reset_data (a, b, c, N);
loop3 (a, b ,c);
- check (a, b ,c);
+ check_plus (a, b, c, N);
return 0;
}
diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c
index 17b6124..4fe116e 100644
--- a/gcc/testsuite/gcc.target/arm/lob6.c
+++ b/gcc/testsuite/gcc.target/arm/lob6.c
@@ -79,14 +79,14 @@ check (void)
int
main (void)
{
- reset_data (a1, b1, c1);
- reset_data (a2, b2, c2);
+ reset_data (a1, b1, c1, N);
+ reset_data (a2, b2, c2, N);
loop1 (a1, b1, c1);
ref1 (a2, b2, c2);
check ();
- reset_data (a1, b1, c1);
- reset_data (a2, b2, c2);
+ reset_data (a1, b1, c1, N);
+ reset_data (a2, b2, c2, N);
loop2 (a1, b1, c1);
ref2 (a2, b2, c2);
check ();
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c
new file mode 100644
index 0000000..6e6da3d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c
@@ -0,0 +1,146 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+
+#define IMM 5
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (va, vb, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ b += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (32, 4, w, NAME, PRED)
+
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vaddq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vmulq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vsubq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vhaddq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vorrq, _x)
+
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY_M(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (__inactive, va, vb, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ b += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (32, 4, w, NAME, PRED)
+
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vaddq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vmulq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vsubq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vhaddq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vorrq, _m)
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (va, IMM, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (32, 4, w, NAME, PRED)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vaddq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vmulq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vsubq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vhaddq, _x)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vbrsrq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshlq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshrq, _x)
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (__inactive, va, IMM, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (32, 4, w, NAME, PRED)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vaddq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vmulq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vsubq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vhaddq, _m)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vbrsrq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshlq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshrq, _m)
+
+/* The expected number of DLSTPs is the number of
+ `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macro uses (24) * 6. */
+/* { dg-final { scan-assembler-times {\tdlstp} 144 } } */
+/* { dg-final { scan-assembler-times {\tletp} 144 } } */
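
The 144 expected dlstp/letp pairs come from the 24 invocations of the
`TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY*` macros above, each of
which instantiates 6 loops (3 element sizes x signed/unsigned). For reference,
one expansion of TEST_COMPILE_IN_DLSTP_TERNARY (32, 4, w, int, s, vaddq, _x)
should look roughly like the following tail-predicatable loop (an illustrative
sketch, not part of the patch):

void test_vaddq_x_s32 (int32_t *a, int32_t *b, int32_t *c, int n)
{
  while (n > 0)
    {
      /* Predicate covering the (possibly partial) last vector of n.  */
      mve_pred16_t p = vctp32q (n);
      /* Zero-predicated loads feeding a predicated add and store.  */
      int32x4_t va = vldrwq_z_s32 (a, p);
      int32x4_t vb = vldrwq_z_s32 (b, p);
      int32x4_t vc = vaddq_x_s32 (va, vb, p);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
      n -= 4;
    }
}
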
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
new file mode 100644
index 0000000..84f4a2f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
@@ -0,0 +1,749 @@
+
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps -fno-schedule-insns2 " } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-mtune=cortex-m55" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_mve.h>
+/* Using a >=1 condition. */
+void test1 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n >= 1)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+/*
+** test1:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Test a for-loop format that decrements to zero.  */
+int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7};
+void test2 (int32_t *b, int num_elems)
+{
+ for (int i = num_elems; i > 0; i-= 4)
+ {
+ mve_pred16_t p = vctp32q (i);
+ int32x4_t va = vldrwq_z_s32 (&(a[i]), p);
+ vstrwq_p_s32 (b + i, va, p);
+ }
+}
+/*
+** test2:
+**...
+** dlstp.32 lr, r1
+**...
+** vldrw.32 (q[0-9]+), \[r3\], #-16
+** vstrw.32 \1, \[r0\], #-16
+** letp lr, .*
+**...
+*/
+
+/* Iteration counter counting up to num_iter. */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+
+/*
+** test3:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrb.8 \3, \[(r[0-9]+|ip)\]
+**...
+** letp lr, .*
+**...
+*/
+
+/* Iteration counter counting down from num_iter. */
+void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = num_iter; i > 0; i--)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+/*
+** test4:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrb.8 \3, \[(r[0-9]+|ip)\]
+**...
+** letp lr, .*
+**...
+*/
+
+/* Using an unpredicated arithmetic instruction within the loop. */
+void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ /* vc is affected by implicit predication, because vb also
+ came from an unpredicated load, but there is no functional
+ problem, because the result is used in a predicated store. */
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ uint8x16_t vd = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ vstrbq_p_u8 (d, vd, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+
+/*
+** test5:
+**...
+** dlstp.8 lr, r[0-9]+
+**...
+** vldrb.8 q[0-9]+, \[r1\]
+** vldrb.8 q[0-9]+, \[r2\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+**...
+** vstrb.8 \1, \[r2\]
+** vstrb.8 \1, \[r3\]
+** letp lr, .*
+**...
+*/
+
+/* Using a different VPR value for one instruction in the loop. */
+void test6 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test6:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using another VPR value in the loop, with a vctp.
+ The doloop logic will always try to do the transform on the first
+ vctp it encounters, so this is still expected to work. */
+void test7 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q (g);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+/*
+** test7:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vctp,
+ but this time p1 will also change in every loop iteration (still fine). */
+void test8 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q (g);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ g++;
+ }
+}
+
+/*
+** test8:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vctp.32 r4
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+**...
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vctp_m
+ that is independent of the loop vctp VPR. */
+void test9 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p2 = vctp32q_m (n, p1);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test9:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vmsr p0, (r[0-9]+) @ movhi
+** vpst
+** vctpt.32 r3
+** vmrs (r[0-9]+), p0 @ movhi
+** vmsr p0, \1 @ movhi
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vmsr p0, \2 @ movhi
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+**...
+** vstrw.32 \3, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop,
+ with a vctp_m that is tied to the base vctp VPR. This
+ is still fine, because the vctp_m will be transformed
+ into a vctp and be implicitly predicated. */
+void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q_m (n, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p1);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+/*
+ We don't need that extra vctp in the loop and we currently do not optimize
+ it away; however, it is not wrong to use it.
+*/
+/*
+** test10:
+**...
+** dlstp.32 lr, r3
+** vctp.32 r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+**...
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vcmp. */
+void test11 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vb);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p1);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test11:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vcmp.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vcmp_m. */
+void test12 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test12:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vmsr p0, (r[0-9]+|ip) @ movhi
+** vpst
+** vcmpt.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \2, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vcmp_m
+ that is tied to the base vctp VPR (same as above, this will be turned
+ into a vcmp and be implicitly predicated). */
+void test13 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test13:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vcmp.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Similar to test27 in dlstp-invalid-asm.c, but use a predicated load to make
+ it safe to implicitly predicate the vaddv. */
+void test14 (int32_t *a, int32_t *c, int n)
+{
+ int32_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ res += vaddvq_s32 (va);
+ int32x4_t vc = vdupq_n_s32 (res);
+ vstrwq_p_s32 (c, vc, p);
+ a += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test14:
+**...
+** dlstp.32 lr, r2
+** vldrw.32 (q[0-9]+), \[r0\], #16
+** vaddv.s32 (r[0-9]+|ip), \1
+** add (r[0-9]+|ip), \3, \2
+** vdup.32 (q[0-9]+), \3
+** vstrw.32 \4, \[r1\]
+** letp lr, .*
+**...
+*/
+
+uint8_t test15 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq_m (vc, va, vc, p);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+/*
+** test15:
+**...
+** dlstp.8 lr, r2
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+**...
+** letp lr, .*
+** vmov.u8 r[0-9]+, \2\[5\]
+**...
+*/
+
+uint8_t test16 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq (va, vc);
+ vc = vaddq_m (vc, va, vc, p);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+/*
+** test16:
+**...
+** dlstp.8 lr, r2
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+** vadd.i8 \2, q[0-9]+, q[0-9]+
+** letp lr, .*
+** vmov.u8 r[0-9]+, \2\[5\]
+**...
+*/
+
+
+
+/* Using an across-vector unpredicated instruction in a valid way.
+ This tests that "vc" has correctly masked the risky "vb". */
+uint16_t test18 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ uint16x8_t vc = vaddq_m_u16 (va, va, vb, p);
+ res += vaddvq_u16 (vc);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/*
+** test18:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** vadd.i16 \1, q[0-9]+, q[0-9]+
+** vaddv.u16 (r[0-9]+|ip), \1
+** add (r[0-9]+|ip), \3, \2
+** uxth \3, \3
+** letp lr, .*
+**...
+*/
+
+/* Using an across-vector unpredicated instruction with implicit scalar adding from outside the loop. */
+uint16_t test19 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ uint16x8_t vc = vaddq_m_u16 (va, va, vb, p);
+ res = vaddvaq_u16 (res, vc);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/*
+** test19:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** vadd.i16 \1, q[0-9]+, q[0-9]+
+** vaddva.u16 (r[0-9]+|ip), \1
+** uxth \2, \2
+** letp lr, .*
+**...
+*/
+
+
+/* Using an across-vector predicated instruction in a valid way. */
+uint16_t test20 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_u16 (a);
+ res = vaddvaq_p_u16 (res, va, p);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/* The uxth could be moved outside the loop. */
+/*
+** test20:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** vaddva.u16 (r[0-9]+|ip), \1
+** uxth \2, \2
+** letp lr, .*
+**...
+*/
+
+/* Using an across-vector predicated instruction in a valid way. */
+uint16_t test21 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_u16 (a);
+ res++;
+ res = vaddvaq_p_u16 (res, va, p);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/* It should also be safe to move the uxth outside of the loop here. */
+/*
+** test21:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** adds (r[0-9]+|ip), \2, #1
+** uxth \2, \2
+** vaddva.u16 \2, \1
+** uxth \2, \2
+** letp lr, .*
+**...
+*/
+
+int test22 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ res = vmaxvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+/*
+** test22:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 (q[0-9]+), \[r[0-9]+\]
+**...
+** vmaxv.u8 (r[0-9]+|ip), \1
+** uxtb \2, \2
+** letp lr, .*
+**...
+*/
+
+int test23 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vmaxavq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+/*
+** test23:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 (q[0-9]+), \[r3\]
+**...
+** vmaxav.s8 (r[0-9]+|ip), \1
+** uxtb \2, \2
+** letp lr, .*
+**...
+*/
+
+/* Like test1, but update n before vctp, meaning we should only iterate for n-4
+ elements. */
+void test24 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n >= 1)
+ {
+ n-=4;
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ }
+}
+/*
+** test24:
+**...
+** subs r3, r3, #4
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c
new file mode 100644
index 0000000..c784f54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c
@@ -0,0 +1,46 @@
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+
+/* We don't support pattern recognition of signed N values when computing num_iter. */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+
+/* Using a predicated vcmp to generate a new predicate value in the
+ loop and then using it in a predicated store insn. */
+void test17 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_s32 (va, vb);
+ mve_pred16_t p1 = vcmpeqq_m_s32 (va, vc, p);
+ vstrwq_p_s32 (c, vc, p1);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+/* This is an example of a loop that we could tail predicate but currently don't. */
+/* { dg-final { scan-assembler "letp" { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c
new file mode 100644
index 0000000..6966a39
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+#include "dlstp-int16x8.c"
+
+int main ()
+{
+ int i;
+ int16_t temp1[N];
+ int16_t temp2[N];
+ int16_t temp3[N];
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus16 (temp1, temp2, temp3, 0);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus16 (temp1, temp2, temp3, 1);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 7);
+ check_plus16 (temp1, temp2, temp3, 7);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 8);
+ check_plus16 (temp1, temp2, temp3, 8);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 9);
+ check_plus16 (temp1, temp2, temp3, 9);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 16);
+ check_plus16 (temp1, temp2, temp3, 16);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 17);
+ check_plus16 (temp1, temp2, temp3, 17);
+
+ reset_data16 (temp1, temp2, temp3, N);
+}
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c
new file mode 100644
index 0000000..33632c5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
+void __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ int16x8_t va = vldrhq_z_s16 (a, p);
+ int16x8_t vb = vldrhq_z_s16 (b, p);
+ int16x8_t vc = vaddq_x_s16 (va, vb, p);
+ vstrhq_p_s16 (c, vc, p);
+ c+=8;
+ a+=8;
+ b+=8;
+ n-=8;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.16} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c
new file mode 100644
index 0000000..6833ddd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include "dlstp-int32x4.c"
+
+int main ()
+{
+ int i;
+ int32_t temp1[N];
+ int32_t temp2[N];
+ int32_t temp3[N];
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus32 (temp1, temp2, temp3, 0);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus32 (temp1, temp2, temp3, 1);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 3);
+ check_plus32 (temp1, temp2, temp3, 3);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 4);
+ check_plus32 (temp1, temp2, temp3, 4);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 5);
+ check_plus32 (temp1, temp2, temp3, 5);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 8);
+ check_plus32 (temp1, temp2, temp3, 8);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 9);
+ check_plus32 (temp1, temp2, temp3, 9);
+
+ reset_data32 (temp1, temp2, temp3, N);
+}
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c
new file mode 100644
index 0000000..5d09f78
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
+void __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.32} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c
new file mode 100644
index 0000000..cc0b9ce
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include "dlstp-int64x2.c"
+
+int main ()
+{
+ int i;
+ int64_t temp1[N];
+ int64_t temp3[N];
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 0);
+ check_memcpy64 (temp1, temp3, 0);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 1);
+ check_memcpy64 (temp1, temp3, 1);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 2);
+ check_memcpy64 (temp1, temp3, 2);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 3);
+ check_memcpy64 (temp1, temp3, 3);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 4);
+ check_memcpy64 (temp1, temp3, 4);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 5);
+ check_memcpy64 (temp1, temp3, 5);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 6);
+ check_memcpy64 (temp1, temp3, 6);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 7);
+ check_memcpy64 (temp1, temp3, 7);
+
+ reset_data64 (temp1, temp3, N);
+}
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c
new file mode 100644
index 0000000..21e8824
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c
@@ -0,0 +1,28 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
+void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp64q (n);
+ int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (0, 8), p);
+ vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (0, 8), va, p);
+ c+=2;
+ a+=2;
+ n-=2;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.64} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
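
In the int64x2 test above, vcreateq_u64 (0, 8) builds the byte-offset vector
{0, 8}, so each gather/scatter pair moves two consecutive int64_t elements and
the loop is effectively a predicated copy of n elements from a to c, which is
what check_memcpy64 in lob.h verifies. A scalar sketch of the intended
behaviour (illustrative only, not part of the patch):

void copy_n_int64 (int64_t *a, int64_t *c, int n)
{
  /* Copy exactly n elements; anything past n is left untouched, matching
     the tail-predicated gather/scatter loop.  */
  for (int i = 0; i < n; i++)
    c[i] = a[i];
}
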
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c
new file mode 100644
index 0000000..d46571f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include "dlstp-int8x16.c"
+
+int main ()
+{
+ int i;
+ int8_t temp1[N];
+ int8_t temp2[N];
+ int8_t temp3[N];
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus8 (temp1, temp2, temp3, 0);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus8 (temp1, temp2, temp3, 1);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 15);
+ check_plus8 (temp1, temp2, temp3, 15);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 16);
+ check_plus8 (temp1, temp2, temp3, 16);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 17);
+ check_plus8 (temp1, temp2, temp3, 17);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 32);
+ check_plus8 (temp1, temp2, temp3, 32);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 33);
+ check_plus8 (temp1, temp2, temp3, 33);
+
+ reset_data8 (temp1, temp2, temp3, N);
+}
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c
new file mode 100644
index 0000000..d5f22b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
+void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ int8x16_t vb = vldrbq_z_s8 (b, p);
+ int8x16_t vc = vaddq_x_s8 (va, vb, p);
+ vstrbq_p_s8 (c, vc, p);
+ c+=16;
+ a+=16;
+ b+=16;
+ n-=16;
+ }
+}
+
+
+/* { dg-final { scan-assembler-times {\tdlstp.8} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
new file mode 100644
index 0000000..26df2d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
@@ -0,0 +1,521 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <limits.h>
+#include <arm_mve.h>
+
+/* Terminating on a non-zero number of elements. */
+void test0 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ while (n > 1)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Terminating on n >= 0. */
+void test1 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ while (n >= 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Similar, terminating on a non-zero number of elements, but in a for loop
+ format. */
+int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7};
+void test2 (int32_t *b, int num_elems)
+{
+ for (int i = num_elems; i >= 2; i-= 4)
+ {
+ mve_pred16_t p = vctp32q (i);
+ int32x4_t va = vldrwq_z_s32 (&(a[i]), p);
+ vstrwq_p_s32 (b + i, va, p);
+ }
+}
+
+/* Iteration counter counting up to num_iter, with a non-zero starting num. */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 1; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Iteration counter counting up to num_iter, with a larger increment.  */
+void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i+=2)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Using an unpredicated store instruction within the loop. */
+void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ uint8x16_t vd = vaddq_x_u8 (va, vb, p);
+ vstrbq_u8 (d, vd);
+ n -= 16;
+ }
+}
+
+/* Using an unpredicated store outside the loop. */
+void test6 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ vx = vaddq_u8 (vx, vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ vstrbq_u8 (c, vx);
+}
+
+/* Using a VPR that gets modified within the loop. */
+void test9 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ p++;
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using a VPR that gets re-generated within the loop. */
+void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ mve_pred16_t p = vctp32q (n);
+ while (n > 0)
+ {
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ p = vctp32q (n);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using vctp32q_m instead of vctp32q. */
+void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q_m (n, p0);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using an unpredicated op with a scalar output, where the result is valid
+ outside the bb. This is invalid, because one of the inputs to the
+ unpredicated op is also unpredicated. */
+uint8_t test12 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ sum += vaddvq_u8 (vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
+/* Using an unpredicated vcmp to generate a new predicate value in the
+ loop and then using that VPR to predicate a store insn. */
+void test13 (int32_t *a, int32_t *b, int32x4_t vc, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_s32 (a);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_s32 (va, vb);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vc);
+ vstrwq_p_s32 (c, vc, p1);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using an across-vector unpredicated instruction. "vb" is the risk. */
+uint16_t test14 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ vb = vaddq_u16 (va, vb);
+ res = vaddvq_u16 (vb);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/* Using an across-vector unpredicated instruction. "vc" is the risk. */
+uint16_t test15 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ uint16x8_t vc = vaddq_u16 (va, vb);
+ res = vaddvaq_u16 (res, vc);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+uint16_t test16 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16_t res =0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ res = vaddvaq_u16 (res, vb);
+ res = vaddvaq_p_u16 (res, va, p);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+int test17 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vmaxvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+
+
+int test18 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vminvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+int test19 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vminavq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+int test20 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ res = vminvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+uint8x16_t test21 (uint8_t *a, uint32_t *b, int n, uint8x16_t res)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ res = vshlcq_u8 (va, b, 1);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+int8x16_t test22 (int8_t *a, int32_t *b, int n, int8x16_t res)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vshlcq_s8 (va, b, 1);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+/* Using an unsigned number of elements to count down from, with an n > 0 condition. */
+void test23 (int32_t *a, int32_t *b, int32_t *c, unsigned int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+/* Using an unsigned number of elements to count up to, with an i < n condition. */
+void test24 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned int n)
+{
+ for (int i = 0; i < n; i+=16)
+ {
+ mve_pred16_t p = vctp8q (n-i);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ }
+}
+
+
+/* Using an unsigned number of elements to count up to, with an i <= n condition. */
+void test25 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned int n)
+{
+ for (int i = 1; i <= n; i+=16)
+ {
+ mve_pred16_t p = vctp8q (n-i+1);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ }
+}
+/* Update n twice in the loop. */
+void test26 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n >= 1)
+ {
+ n-=4;
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+void test27 (int32_t *a, int32_t *c, int n)
+{
+ int32_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_s32 (a);
+ res += vaddvq_s32 (va);
+ int32x4_t vc = vdupq_n_s32 (res);
+ vstrwq_p_s32 (c, vc, p);
+ a += 4;
+ n -= 4;
+ }
+}
+
+/* Using an unpredicated vcmp to generate a new predicate value in the
+ loop and then using it in a predicated store insn. */
+void test28 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_s32 (b);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vc);
+ vstrwq_p_s32 (c, vc, p1);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using an unpredicated op with a scalar output, where the result is valid
+ outside the bb. The unpredicated lanes are not guaranteed zero, so would
+ affect the vaddv in the non-tail predicated case. */
+uint8_t test29 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ sum += vaddvq_u8 (vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
+/* Same as above, but with another scalar op between the unpredicated op and
+ the scalar op outside the loop. */
+uint8_t test30 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx, int g)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ sum += vaddvq_u8 (vc);
+ sum += g;
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
+uint8_t test31 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq (vb, vc);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+uint8_t test32 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq_m (vc, va, vc, p);
+ vc = vaddq (vb, vc);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+/* { dg-final { scan-assembler-not "\tdlstp" } } */
+/* { dg-final { scan-assembler-not "\tletp" } } */