aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- sim/aarch64/ChangeLog | 11
-rw-r--r-- sim/aarch64/simulator.c | 290
-rw-r--r-- sim/testsuite/sim/aarch64/ChangeLog | 9
-rw-r--r-- sim/testsuite/sim/aarch64/fcvtz.s | 1
-rw-r--r-- sim/testsuite/sim/aarch64/fstur.s | 1
-rw-r--r-- sim/testsuite/sim/aarch64/ldn_multiple.s | 136
-rw-r--r-- sim/testsuite/sim/aarch64/ldn_single.s | 2
-rw-r--r-- sim/testsuite/sim/aarch64/ldnr.s | 2
-rw-r--r-- sim/testsuite/sim/aarch64/mla.s | 2
-rw-r--r-- sim/testsuite/sim/aarch64/mls.s | 2
-rw-r--r-- sim/testsuite/sim/aarch64/stn_multiple.s | 171
-rw-r--r-- sim/testsuite/sim/aarch64/stn_single.s | 24
-rw-r--r-- sim/testsuite/sim/aarch64/sumulh.s | 3
-rw-r--r-- sim/testsuite/sim/aarch64/uzp.s | 2
14 files changed, 454 insertions, 202 deletions
diff --git a/sim/aarch64/ChangeLog b/sim/aarch64/ChangeLog
index 42379df..f1574a5 100644
--- a/sim/aarch64/ChangeLog
+++ b/sim/aarch64/ChangeLog
@@ -1,3 +1,14 @@
+2017-04-22 Jim Wilson <jim.wilson@linaro.org>
+
+ * simulator.c (vec_load): Add M argument. Rewrite to iterate over
+ registers based on structure size.
+ (LD4, LD3, LD2, LD1_2, LD1_3, LD1_4): Pass new arg to vec_load.
+ (LD1_1): Replace with call to vec_load.
+ (vec_store): Add new M argument. Rewrite to iterate over registers
+ based on structure size.
+ (ST4, ST3, ST2, ST1_2, ST1_3, ST1_4): Pass new arg to vec_store.
+ (ST1_1): Replace with call to vec_store.
+
2017-04-08 Jim Wilson <jim.wilson@linaro.org>
* simulator.c (do_vec_FCVTL): New.
diff --git a/sim/aarch64/simulator.c b/sim/aarch64/simulator.c
index 16d8d8d..18f7944 100644
--- a/sim/aarch64/simulator.c
+++ b/sim/aarch64/simulator.c
@@ -11524,310 +11524,224 @@ vec_reg (unsigned v, unsigned o)
return (v + o) & 0x3F;
}
-/* Load multiple N-element structures to N consecutive registers. */
+/* Load multiple N-element structures to M consecutive registers. */
static void
-vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
+vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
{
int all = INSTR (30, 30);
unsigned size = INSTR (11, 10);
unsigned vd = INSTR (4, 0);
- unsigned i;
+ unsigned rpt = (N == M) ? 1 : M;
+ unsigned selem = N;
+ unsigned i, j, k;
switch (size)
{
case 0: /* 8-bit operations. */
- if (all)
- for (i = 0; i < (16 * N); i++)
- aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
- aarch64_get_mem_u8 (cpu, address + i));
- else
- for (i = 0; i < (8 * N); i++)
- aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
- aarch64_get_mem_u8 (cpu, address + i));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (8 + (8 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j,
+ aarch64_get_mem_u8 (cpu, address));
+ address += 1;
+ }
return;
case 1: /* 16-bit operations. */
- if (all)
- for (i = 0; i < (8 * N); i++)
- aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
- aarch64_get_mem_u16 (cpu, address + i * 2));
- else
- for (i = 0; i < (4 * N); i++)
- aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
- aarch64_get_mem_u16 (cpu, address + i * 2));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (4 + (4 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j,
+ aarch64_get_mem_u16 (cpu, address));
+ address += 2;
+ }
return;
case 2: /* 32-bit operations. */
- if (all)
- for (i = 0; i < (4 * N); i++)
- aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
- aarch64_get_mem_u32 (cpu, address + i * 4));
- else
- for (i = 0; i < (2 * N); i++)
- aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
- aarch64_get_mem_u32 (cpu, address + i * 4));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (2 + (2 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j,
+ aarch64_get_mem_u32 (cpu, address));
+ address += 4;
+ }
return;
case 3: /* 64-bit operations. */
- if (all)
- for (i = 0; i < (2 * N); i++)
- aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
- aarch64_get_mem_u64 (cpu, address + i * 8));
- else
- for (i = 0; i < N; i++)
- aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
- aarch64_get_mem_u64 (cpu, address + i * 8));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (1 + all); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j,
+ aarch64_get_mem_u64 (cpu, address));
+ address += 8;
+ }
return;
}
}
-/* LD4: load multiple 4-element to four consecutive registers. */
+/* Load multiple 4-element structures into four consecutive registers. */
static void
LD4 (sim_cpu *cpu, uint64_t address)
{
- vec_load (cpu, address, 4);
+ vec_load (cpu, address, 4, 4);
}
-/* LD3: load multiple 3-element structures to three consecutive registers. */
+/* Load multiple 3-element structures into three consecutive registers. */
static void
LD3 (sim_cpu *cpu, uint64_t address)
{
- vec_load (cpu, address, 3);
+ vec_load (cpu, address, 3, 3);
}
-/* LD2: load multiple 2-element structures to two consecutive registers. */
+/* Load multiple 2-element structures into two consecutive registers. */
static void
LD2 (sim_cpu *cpu, uint64_t address)
{
- vec_load (cpu, address, 2);
+ vec_load (cpu, address, 2, 2);
}
/* Load multiple 1-element structures into one register. */
static void
LD1_1 (sim_cpu *cpu, uint64_t address)
{
- int all = INSTR (30, 30);
- unsigned size = INSTR (11, 10);
- unsigned vd = INSTR (4, 0);
- unsigned i;
-
- switch (size)
- {
- case 0:
- /* LD1 {Vd.16b}, addr, #16 */
- /* LD1 {Vd.8b}, addr, #8 */
- for (i = 0; i < (all ? 16 : 8); i++)
- aarch64_set_vec_u8 (cpu, vd, i,
- aarch64_get_mem_u8 (cpu, address + i));
- return;
-
- case 1:
- /* LD1 {Vd.8h}, addr, #16 */
- /* LD1 {Vd.4h}, addr, #8 */
- for (i = 0; i < (all ? 8 : 4); i++)
- aarch64_set_vec_u16 (cpu, vd, i,
- aarch64_get_mem_u16 (cpu, address + i * 2));
- return;
-
- case 2:
- /* LD1 {Vd.4s}, addr, #16 */
- /* LD1 {Vd.2s}, addr, #8 */
- for (i = 0; i < (all ? 4 : 2); i++)
- aarch64_set_vec_u32 (cpu, vd, i,
- aarch64_get_mem_u32 (cpu, address + i * 4));
- return;
-
- case 3:
- /* LD1 {Vd.2d}, addr, #16 */
- /* LD1 {Vd.1d}, addr, #8 */
- for (i = 0; i < (all ? 2 : 1); i++)
- aarch64_set_vec_u64 (cpu, vd, i,
- aarch64_get_mem_u64 (cpu, address + i * 8));
- return;
- }
+ vec_load (cpu, address, 1, 1);
}
/* Load multiple 1-element structures into two registers. */
static void
LD1_2 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the LD2 version.
- So why have two different instructions ? There must be something
- wrong somewhere. */
- vec_load (cpu, address, 2);
+ vec_load (cpu, address, 1, 2);
}
/* Load multiple 1-element structures into three registers. */
static void
LD1_3 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the LD3 version.
- So why have two different instructions ? There must be something
- wrong somewhere. */
- vec_load (cpu, address, 3);
+ vec_load (cpu, address, 1, 3);
}
/* Load multiple 1-element structures into four registers. */
static void
LD1_4 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the LD4 version.
- So why have two different instructions ? There must be something
- wrong somewhere. */
- vec_load (cpu, address, 4);
+ vec_load (cpu, address, 1, 4);
}
-/* Store multiple N-element structures to N consecutive registers. */
+/* Store multiple N-element structures from M consecutive registers. */
static void
-vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
+vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
{
int all = INSTR (30, 30);
unsigned size = INSTR (11, 10);
unsigned vd = INSTR (4, 0);
- unsigned i;
+ unsigned rpt = (N == M) ? 1 : M;
+ unsigned selem = N;
+ unsigned i, j, k;
switch (size)
{
case 0: /* 8-bit operations. */
- if (all)
- for (i = 0; i < (16 * N); i++)
- aarch64_set_mem_u8
- (cpu, address + i,
- aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
- else
- for (i = 0; i < (8 * N); i++)
- aarch64_set_mem_u8
- (cpu, address + i,
- aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (8 + (8 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_mem_u8
+ (cpu, address,
+ aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j));
+ address += 1;
+ }
return;
case 1: /* 16-bit operations. */
- if (all)
- for (i = 0; i < (8 * N); i++)
- aarch64_set_mem_u16
- (cpu, address + i * 2,
- aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
- else
- for (i = 0; i < (4 * N); i++)
- aarch64_set_mem_u16
- (cpu, address + i * 2,
- aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (4 + (4 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_mem_u16
+ (cpu, address,
+ aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j));
+ address += 2;
+ }
return;
case 2: /* 32-bit operations. */
- if (all)
- for (i = 0; i < (4 * N); i++)
- aarch64_set_mem_u32
- (cpu, address + i * 4,
- aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
- else
- for (i = 0; i < (2 * N); i++)
- aarch64_set_mem_u32
- (cpu, address + i * 4,
- aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (2 + (2 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_mem_u32
+ (cpu, address,
+ aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j));
+ address += 4;
+ }
return;
case 3: /* 64-bit operations. */
- if (all)
- for (i = 0; i < (2 * N); i++)
- aarch64_set_mem_u64
- (cpu, address + i * 8,
- aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
- else
- for (i = 0; i < N; i++)
- aarch64_set_mem_u64
- (cpu, address + i * 8,
- aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (1 + all); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_mem_u64
+ (cpu, address,
+ aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j));
+ address += 8;
+ }
return;
}
}
-/* Store multiple 4-element structure to four consecutive registers. */
+/* Store multiple 4-element structures from four consecutive registers. */
static void
ST4 (sim_cpu *cpu, uint64_t address)
{
- vec_store (cpu, address, 4);
+ vec_store (cpu, address, 4, 4);
}
-/* Store multiple 3-element structures to three consecutive registers. */
+/* Store multiple 3-element structures from three consecutive registers. */
static void
ST3 (sim_cpu *cpu, uint64_t address)
{
- vec_store (cpu, address, 3);
+ vec_store (cpu, address, 3, 3);
}
-/* Store multiple 2-element structures to two consecutive registers. */
+/* Store multiple 2-element structures from two consecutive registers. */
static void
ST2 (sim_cpu *cpu, uint64_t address)
{
- vec_store (cpu, address, 2);
+ vec_store (cpu, address, 2, 2);
}
-/* Store multiple 1-element structures into one register. */
+/* Store multiple 1-element structures from one register. */
static void
ST1_1 (sim_cpu *cpu, uint64_t address)
{
- int all = INSTR (30, 30);
- unsigned size = INSTR (11, 10);
- unsigned vd = INSTR (4, 0);
- unsigned i;
-
- switch (size)
- {
- case 0:
- for (i = 0; i < (all ? 16 : 8); i++)
- aarch64_set_mem_u8 (cpu, address + i,
- aarch64_get_vec_u8 (cpu, vd, i));
- return;
-
- case 1:
- for (i = 0; i < (all ? 8 : 4); i++)
- aarch64_set_mem_u16 (cpu, address + i * 2,
- aarch64_get_vec_u16 (cpu, vd, i));
- return;
-
- case 2:
- for (i = 0; i < (all ? 4 : 2); i++)
- aarch64_set_mem_u32 (cpu, address + i * 4,
- aarch64_get_vec_u32 (cpu, vd, i));
- return;
-
- case 3:
- for (i = 0; i < (all ? 2 : 1); i++)
- aarch64_set_mem_u64 (cpu, address + i * 8,
- aarch64_get_vec_u64 (cpu, vd, i));
- return;
- }
+ vec_store (cpu, address, 1, 1);
}
-/* Store multiple 1-element structures into two registers. */
+/* Store multiple 1-element structures from two registers. */
static void
ST1_2 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the ST2 version.
- So why have two different instructions ? There must be
- something wrong somewhere. */
- vec_store (cpu, address, 2);
+ vec_store (cpu, address, 1, 2);
}
-/* Store multiple 1-element structures into three registers. */
+/* Store multiple 1-element structures from three registers. */
static void
ST1_3 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the ST3 version.
- So why have two different instructions ? There must be
- something wrong somewhere. */
- vec_store (cpu, address, 3);
+ vec_store (cpu, address, 1, 3);
}
-/* Store multiple 1-element structures into four registers. */
+/* Store multiple 1-element structures from four registers. */
static void
ST1_4 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the ST4 version.
- So why have two different instructions ? There must be
- something wrong somewhere. */
- vec_store (cpu, address, 4);
+ vec_store (cpu, address, 1, 4);
}
#define LDn_STn_SINGLE_LANE_AND_SIZE() \
diff --git a/sim/testsuite/sim/aarch64/ChangeLog b/sim/testsuite/sim/aarch64/ChangeLog
index cf0da6d..f4671da 100644
--- a/sim/testsuite/sim/aarch64/ChangeLog
+++ b/sim/testsuite/sim/aarch64/ChangeLog
@@ -1,3 +1,12 @@
+2017-04-22 Jim Wilson <jim.wilson@linaro.org>
+
+ * fcvtz.s, fstur.s, ldn_single.s, ldnr.s, mla.s, mls.s, uzp.s: Align
+ data.
+ * sumulh.s: Delete unnecessary data alignment.
+ * stn_single.s: Align data. Fix unaligned ldr insns. Adjust cmp
+ arguments to match change.
+ * ldn_multiple.s, stn_multiple.s: New.
+
2017-04-08 Jim Wilson <jim.wilson@linaro.org>
* fcvtl.s: New.
diff --git a/sim/testsuite/sim/aarch64/fcvtz.s b/sim/testsuite/sim/aarch64/fcvtz.s
index 9bb6f9b..311fc2e 100644
--- a/sim/testsuite/sim/aarch64/fcvtz.s
+++ b/sim/testsuite/sim/aarch64/fcvtz.s
@@ -8,6 +8,7 @@
# For 64-bit unsigned convert, test values 1.5, LONG_MAX, and ULONG_MAX.
.data
+ .align 4
fm1p5:
.word 3217031168
fimax:
diff --git a/sim/testsuite/sim/aarch64/fstur.s b/sim/testsuite/sim/aarch64/fstur.s
index 2206ae5..80e5c67 100644
--- a/sim/testsuite/sim/aarch64/fstur.s
+++ b/sim/testsuite/sim/aarch64/fstur.s
@@ -8,6 +8,7 @@
.include "testutils.inc"
.data
+ .align 4
fm1:
.word 3212836864
fmax:
diff --git a/sim/testsuite/sim/aarch64/ldn_multiple.s b/sim/testsuite/sim/aarch64/ldn_multiple.s
new file mode 100644
index 0000000..285ef7e
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/ldn_multiple.s
@@ -0,0 +1,136 @@
+# mach: aarch64
+
+# Check the load multiple structure instructions: ld1, ld2, ld3, ld4.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+ .data
+ .align 4
+input:
+ .word 0x04030201
+ .word 0x08070605
+ .word 0x0c0b0a09
+ .word 0x100f0e0d
+ .word 0xfcfdfeff
+ .word 0xf8f9fafb
+ .word 0xf4f5f6f7
+ .word 0xf0f1f2f3
+
+ start
+ adrp x0, input
+ add x0, x0, :lo12:input
+
+ mov x2, x0
+ mov x3, #16
+ ld1 {v0.16b}, [x2], 16
+ ld1 {v1.8h}, [x2], x3
+ addv b4, v0.16b
+ addv b5, v1.16b
+ mov x4, v4.d[0]
+ cmp x4, #136
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #120
+ bne .Lfailure
+
+ mov x2, x0
+ mov x3, #16
+ ld2 {v0.8b, v1.8b}, [x2], x3
+ ld2 {v2.4h, v3.4h}, [x2], 16
+ addv b4, v0.8b
+ addv b5, v1.8b
+ addv b6, v2.8b
+ addv b7, v3.8b
+ mov x4, v4.d[0]
+ cmp x4, #64
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #72
+ bne .Lfailure
+ mov x6, v6.d[0]
+ cmp x6, #196
+ bne .Lfailure
+ mov x7, v7.d[0]
+ cmp x7, #180
+ bne .Lfailure
+
+ mov x2, x0
+ ld3 {v0.2s, v1.2s, v2.2s}, [x2]
+ addv b4, v0.8b
+ addv b5, v1.8b
+ addv b6, v2.8b
+ mov x4, v4.d[0]
+ cmp x4, #68
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #16
+ bne .Lfailure
+ mov x6, v6.d[0]
+ cmp x6, #16
+ bne .Lfailure
+
+ mov x2, x0
+ ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x2]
+ addv b4, v0.8b
+ addv b5, v1.8b
+ addv b6, v2.8b
+ addv b7, v3.8b
+ mov x4, v4.d[0]
+ cmp x4, #0
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #0
+ bne .Lfailure
+ mov x6, v6.d[0]
+ cmp x6, #0
+ bne .Lfailure
+ mov x7, v7.d[0]
+ cmp x7, #0
+ bne .Lfailure
+
+ mov x2, x0
+ ld1 {v0.4s, v1.4s}, [x2]
+ addv b4, v0.16b
+ addv b5, v1.16b
+ mov x4, v4.d[0]
+ cmp x4, #136
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #120
+ bne .Lfailure
+
+ mov x2, x0
+ ld1 {v0.1d, v1.1d, v2.1d}, [x2]
+ addv b4, v0.8b
+ addv b5, v1.8b
+ addv b6, v2.8b
+ mov x4, v4.d[0]
+ cmp x4, #36
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #100
+ bne .Lfailure
+ mov x6, v6.d[0]
+ cmp x6, #220
+ bne .Lfailure
+
+ mov x2, x0
+ ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x2]
+ addv b4, v0.8b
+ addv b5, v1.8b
+ addv b6, v2.8b
+ mov x4, v4.d[0]
+ cmp x4, #36
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #100
+ bne .Lfailure
+ mov x6, v6.d[0]
+ cmp x6, #220
+ bne .Lfailure
+
+ pass
+.Lfailure:
+ fail
diff --git a/sim/testsuite/sim/aarch64/ldn_single.s b/sim/testsuite/sim/aarch64/ldn_single.s
index 4c460fb..9681520 100644
--- a/sim/testsuite/sim/aarch64/ldn_single.s
+++ b/sim/testsuite/sim/aarch64/ldn_single.s
@@ -7,6 +7,8 @@
.include "testutils.inc"
+ .data
+ .align 4
input:
.word 0x04030201
.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/ldnr.s b/sim/testsuite/sim/aarch64/ldnr.s
index a4bfffa..7126c46 100644
--- a/sim/testsuite/sim/aarch64/ldnr.s
+++ b/sim/testsuite/sim/aarch64/ldnr.s
@@ -7,6 +7,8 @@
.include "testutils.inc"
+ .data
+ .align 4
input:
.word 0x04030201
.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/mla.s b/sim/testsuite/sim/aarch64/mla.s
index e0065e7..e3ea836 100644
--- a/sim/testsuite/sim/aarch64/mla.s
+++ b/sim/testsuite/sim/aarch64/mla.s
@@ -4,6 +4,8 @@
.include "testutils.inc"
+ .data
+ .align 4
input:
.word 0x04030201
.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/mls.s b/sim/testsuite/sim/aarch64/mls.s
index a34a1aa..5c9e225 100644
--- a/sim/testsuite/sim/aarch64/mls.s
+++ b/sim/testsuite/sim/aarch64/mls.s
@@ -4,6 +4,8 @@
.include "testutils.inc"
+ .data
+ .align 4
input:
.word 0x04030201
.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/stn_multiple.s b/sim/testsuite/sim/aarch64/stn_multiple.s
new file mode 100644
index 0000000..1a3f24d
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/stn_multiple.s
@@ -0,0 +1,171 @@
+# mach: aarch64
+
+# Check the store multiple structure instructions: st1, st2, st3, st4.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+ .data
+ .align 4
+input:
+ .word 0x04030201
+ .word 0x08070605
+ .word 0x0c0b0a09
+ .word 0x100f0e0d
+ .word 0xfcfdfeff
+ .word 0xf8f9fafb
+ .word 0xf4f5f6f7
+ .word 0xf0f1f2f3
+output:
+ .zero 64
+
+ start
+ adrp x0, input
+ add x0, x0, :lo12:input
+ adrp x1, output
+ add x1, x1, :lo12:output
+
+ mov x2, x0
+ ldr q0, [x2], 16
+ ldr q1, [x2]
+ mov x2, x0
+ ldr q2, [x2], 16
+ ldr q3, [x2]
+
+ mov x2, x1
+ mov x3, #16
+ st1 {v0.16b}, [x2], 16
+ st1 {v1.8h}, [x2], x3
+ mov x2, x1
+ ldr q4, [x2], 16
+ ldr q5, [x2]
+ addv b4, v4.16b
+ addv b5, v5.16b
+ mov x4, v4.d[0]
+ cmp x4, #136
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #120
+ bne .Lfailure
+
+ mov x2, x1
+ mov x3, #16
+ st2 {v0.8b, v1.8b}, [x2], 16
+ st2 {v2.4h, v3.4h}, [x2], x3
+ mov x2, x1
+ ldr q4, [x2], 16
+ ldr q5, [x2]
+ addv b4, v4.16b
+ addv b5, v5.16b
+ mov x4, v4.d[0]
+ cmp x4, #0
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #0
+ bne .Lfailure
+
+ mov x2, x1
+ st3 {v0.4s, v1.4s, v2.4s}, [x2]
+ ldr q4, [x2], 16
+ ldr q5, [x2], 16
+ ldr q6, [x2]
+ addv b4, v4.16b
+ addv b5, v5.16b
+ addv b6, v6.16b
+ mov x4, v4.d[0]
+ cmp x4, #36
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #0
+ bne .Lfailure
+ mov x6, v6.d[0]
+ cmp x6, #100
+ bne .Lfailure
+
+ mov x2, x1
+ st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2]
+ ldr q4, [x2], 16
+ ldr q5, [x2], 16
+ ldr q6, [x2], 16
+ ldr q7, [x2]
+ addv b4, v4.16b
+ addv b5, v5.16b
+ addv b6, v6.16b
+ addv b7, v7.16b
+ mov x4, v4.d[0]
+ cmp x4, #0
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #0
+ bne .Lfailure
+ mov x6, v6.d[0]
+ cmp x6, #0
+ bne .Lfailure
+ mov x7, v7.d[0]
+ cmp x7, #0
+ bne .Lfailure
+
+# Check st1 from two registers (post-index immediate, then no offset).
+
+ mov x2, x1
+ st1 {v0.2s, v1.2s}, [x2], 16
+ st1 {v2.1d, v3.1d}, [x2]
+ mov x2, x1
+ ldr q4, [x2], 16
+ ldr q5, [x2]
+ addv b4, v4.16b
+ addv b5, v5.16b
+ mov x4, v4.d[0]
+ cmp x4, #0
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #0
+ bne .Lfailure
+
+ mov x2, x1
+ st1 {v0.2d, v1.2d, v2.2d}, [x2]
+ mov x2, x1
+ ldr q4, [x2], 16
+ ldr q5, [x2], 16
+ ldr q6, [x2]
+ addv b4, v4.16b
+ addv b5, v5.16b
+ addv b6, v6.16b
+ mov x4, v4.d[0]
+ cmp x4, #136
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #120
+ bne .Lfailure
+ mov x6, v6.d[0]
+ cmp x6, #136
+ bne .Lfailure
+
+ mov x2, x1
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2]
+ mov x2, x1
+ ldr q4, [x2], 16
+ ldr q5, [x2], 16
+ ldr q6, [x2], 16
+ ldr q7, [x2]
+ addv b4, v4.16b
+ addv b5, v5.16b
+ addv b6, v6.16b
+ addv b7, v7.16b
+ mov x4, v4.d[0]
+ cmp x4, #136
+ bne .Lfailure
+ mov x5, v5.d[0]
+ cmp x5, #120
+ bne .Lfailure
+ mov x6, v6.d[0]
+ cmp x6, #136
+ bne .Lfailure
+ mov x7, v7.d[0]
+ cmp x7, #120
+ bne .Lfailure
+
+ pass
+.Lfailure:
+ fail
diff --git a/sim/testsuite/sim/aarch64/stn_single.s b/sim/testsuite/sim/aarch64/stn_single.s
index 2bd19cf..a24b084 100644
--- a/sim/testsuite/sim/aarch64/stn_single.s
+++ b/sim/testsuite/sim/aarch64/stn_single.s
@@ -7,6 +7,8 @@
.include "testutils.inc"
+ .data
+ .align 4
input:
.word 0x04030201
.word 0x08070605
@@ -26,10 +28,10 @@ output:
add x1, x1, :lo12:output
mov x2, x0
- ldr q0, [x2], 8
+ ldr q0, [x2], 16
ldr q1, [x2]
mov x2, x0
- ldr q2, [x2], 8
+ ldr q2, [x2], 16
ldr q3, [x2]
mov x2, x1
@@ -61,9 +63,9 @@ output:
addv b5, v5.16b
mov x5, v4.d[0]
mov x6, v5.d[0]
- cmp x5, #136
+ cmp x5, #200
bne .Lfailure
- cmp x6, #8
+ cmp x6, #72
bne .Lfailure
mov x2, x1
@@ -82,11 +84,11 @@ output:
mov x4, v4.d[0]
mov x5, v5.d[0]
mov x6, v6.d[0]
- cmp x4, #88
+ cmp x4, #120
bne .Lfailure
- cmp x5, #200
+ cmp x5, #8
bne .Lfailure
- cmp x6, #248
+ cmp x6, #24
bne .Lfailure
mov x2, x1
@@ -108,13 +110,13 @@ output:
mov x5, v5.d[0]
mov x6, v6.d[0]
mov x7, v7.d[0]
- cmp x4, #104
+ cmp x4, #168
bne .Lfailure
- cmp x5, #168
+ cmp x5, #232
bne .Lfailure
- cmp x6, #232
+ cmp x6, #40
bne .Lfailure
- cmp x7, #40
+ cmp x7, #104
bne .Lfailure
pass
diff --git a/sim/testsuite/sim/aarch64/sumulh.s b/sim/testsuite/sim/aarch64/sumulh.s
index 17f1ecd..d75e0c6 100644
--- a/sim/testsuite/sim/aarch64/sumulh.s
+++ b/sim/testsuite/sim/aarch64/sumulh.s
@@ -6,9 +6,6 @@
.include "testutils.inc"
- .data
- .align 4
-
start
mov x0, #-2
diff --git a/sim/testsuite/sim/aarch64/uzp.s b/sim/testsuite/sim/aarch64/uzp.s
index 55e2cd7..851005e 100644
--- a/sim/testsuite/sim/aarch64/uzp.s
+++ b/sim/testsuite/sim/aarch64/uzp.s
@@ -4,6 +4,8 @@
.include "testutils.inc"
+ .data
+ .align 4
input1:
.word 0x04030201
.word 0x08070605