about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- sim/aarch64/ChangeLog 12
-rw-r--r-- sim/aarch64/simulator.c 551
-rw-r--r-- sim/testsuite/sim/aarch64/ChangeLog 6
-rw-r--r-- sim/testsuite/sim/aarch64/ldn_single.s 100
-rw-r--r-- sim/testsuite/sim/aarch64/ldnr.s 176
-rw-r--r-- sim/testsuite/sim/aarch64/stn_single.s 122
6 files changed, 698 insertions, 269 deletions
diff --git a/sim/aarch64/ChangeLog b/sim/aarch64/ChangeLog
index f9a62e7..2a21fc3 100644
--- a/sim/aarch64/ChangeLog
+++ b/sim/aarch64/ChangeLog
@@ -1,3 +1,15 @@
+2017-02-14 Jim Wilson <jim.wilson@linaro.org>
+
+ * simulator.c (LDn_STn_SINGLE_LANE_AND_SIZE): New.
+ (do_vec_LDn_single, do_vec_STn_single): New.
+ (do_vec_LDnR): Add and set new nregs var. Replace switch on nregs with
+ loop over nregs using new var n. Add n times size to address in loop.
+ Add n to vd in loop.
+ (do_vec_load_store): Add comment for instruction bit 24. New var
+ single to hold instruction bit 24. Add new code to use single. Move
+ ldnr support inside single if statements. Fix ldnr register counts
+ inside post if statement. Change HALT_NYI calls to HALT_UNALLOC.
+
2017-01-23 Jim Wilson <jim.wilson@linaro.org>
* simulator.c (do_vec_compare): Add case 0x23 for CMTST.
diff --git a/sim/aarch64/simulator.c b/sim/aarch64/simulator.c
index a44e70a..403edb7 100644
--- a/sim/aarch64/simulator.c
+++ b/sim/aarch64/simulator.c
@@ -11560,284 +11560,246 @@ ST1_4 (sim_cpu *cpu, uint64_t address)
vec_store (cpu, address, 4);
}
+#define LDn_STn_SINGLE_LANE_AND_SIZE() \
+ do \
+ { \
+ switch (INSTR (15, 14)) \
+ { \
+ case 0: \
+ lane = (full << 3) | (s << 2) | size; \
+ size = 0; \
+ break; \
+ \
+ case 1: \
+ if ((size & 1) == 1) \
+ HALT_UNALLOC; \
+ lane = (full << 2) | (s << 1) | (size >> 1); \
+ size = 1; \
+ break; \
+ \
+ case 2: \
+ if ((size & 2) == 2) \
+ HALT_UNALLOC; \
+ \
+ if ((size & 1) == 0) \
+ { \
+ lane = (full << 1) | s; \
+ size = 2; \
+ } \
+ else \
+ { \
+ if (s) \
+ HALT_UNALLOC; \
+ lane = full; \
+ size = 3; \
+ } \
+ break; \
+ \
+ default: \
+ HALT_UNALLOC; \
+ } \
+ } \
+ while (0)
+
+/* Load single structure into one lane of N registers. */
static void
-do_vec_LDnR (sim_cpu *cpu, uint64_t address)
+do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
{
/* instr[31] = 0
instr[30] = element selector 0=>half, 1=>all elements
instr[29,24] = 00 1101
instr[23] = 0=>simple, 1=>post
instr[22] = 1
- instr[21] = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
+ instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11111 (immediate post inc)
- instr[15,14] = 11
- instr[13] = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
- instr[12] = 0
- instr[11,10] = element size 00=> byte(b), 01=> half(h),
- 10=> word(s), 11=> double(d)
+ instr[15,13] = opcode
+ instr[12] = S, used for lane number
+ instr[11,10] = size, also used for lane number
instr[9,5] = address
instr[4,0] = Vd */
unsigned full = INSTR (30, 30);
unsigned vd = INSTR (4, 0);
unsigned size = INSTR (11, 10);
+ unsigned s = INSTR (12, 12);
+ int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+ int lane = 0;
int i;
NYI_assert (29, 24, 0x0D);
NYI_assert (22, 22, 1);
- NYI_assert (15, 14, 3);
- NYI_assert (12, 12, 0);
- switch ((INSTR (13, 13) << 1) | INSTR (21, 21))
- {
- case 0: /* LD1R. */
- switch (size)
- {
- case 0:
- {
- uint8_t val = aarch64_get_mem_u8 (cpu, address);
- for (i = 0; i < (full ? 16 : 8); i++)
- aarch64_set_vec_u8 (cpu, vd, i, val);
- break;
- }
-
- case 1:
- {
- uint16_t val = aarch64_get_mem_u16 (cpu, address);
- for (i = 0; i < (full ? 8 : 4); i++)
- aarch64_set_vec_u16 (cpu, vd, i, val);
- break;
- }
-
- case 2:
- {
- uint32_t val = aarch64_get_mem_u32 (cpu, address);
- for (i = 0; i < (full ? 4 : 2); i++)
- aarch64_set_vec_u32 (cpu, vd, i, val);
- break;
- }
-
- case 3:
- {
- uint64_t val = aarch64_get_mem_u64 (cpu, address);
- for (i = 0; i < (full ? 2 : 1); i++)
- aarch64_set_vec_u64 (cpu, vd, i, val);
- break;
- }
+ /* Compute the lane number first (using size), and then compute size. */
+ LDn_STn_SINGLE_LANE_AND_SIZE ();
- default:
- HALT_UNALLOC;
+ for (i = 0; i < nregs; i++)
+ switch (size)
+ {
+ case 0:
+ {
+ uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
+ aarch64_set_vec_u8 (cpu, vd + i, lane, val);
+ break;
}
- break;
- case 1: /* LD2R. */
- switch (size)
+ case 1:
{
- case 0:
- {
- uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
- uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
-
- for (i = 0; i < (full ? 16 : 8); i++)
- {
- aarch64_set_vec_u8 (cpu, vd, 0, val1);
- aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
- }
- break;
- }
+ uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
+ aarch64_set_vec_u16 (cpu, vd + i, lane, val);
+ break;
+ }
- case 1:
- {
- uint16_t val1 = aarch64_get_mem_u16 (cpu, address);
- uint16_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
+ case 2:
+ {
+ uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
+ aarch64_set_vec_u32 (cpu, vd + i, lane, val);
+ break;
+ }
- for (i = 0; i < (full ? 8 : 4); i++)
- {
- aarch64_set_vec_u16 (cpu, vd, 0, val1);
- aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
- }
- break;
- }
+ case 3:
+ {
+ uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
+ aarch64_set_vec_u64 (cpu, vd + i, lane, val);
+ break;
+ }
+ }
+}
- case 2:
- {
- uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
- uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
+/* Store single structure from one lane of N registers. */
+static void
+do_vec_STn_single (sim_cpu *cpu, uint64_t address)
+{
+ /* instr[31] = 0
+ instr[30] = Q, used for lane number
+ instr[29,24] = 00 1101
+ instr[23] = 0=>simple, 1=>post
+ instr[22] = 0
+ instr[21] = width: ST1-or-ST3 (0) / ST2-or-ST4 (1)
+ instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
+ 11111 (immediate post inc)
+ instr[15,13] = opcode
+ instr[12] = S, used for lane number
+ instr[11,10] = size, also used for lane number
+ instr[9,5] = address
+ instr[4,0] = Vd */
- for (i = 0; i < (full ? 4 : 2); i++)
- {
- aarch64_set_vec_u32 (cpu, vd, 0, val1);
- aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
- }
- break;
- }
+ unsigned full = INSTR (30, 30);
+ unsigned vd = INSTR (4, 0);
+ unsigned size = INSTR (11, 10);
+ unsigned s = INSTR (12, 12);
+ int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+ int lane = 0;
+ int i;
- case 3:
- {
- uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
- uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
+ NYI_assert (29, 24, 0x0D);
+ NYI_assert (22, 22, 0);
- for (i = 0; i < (full ? 2 : 1); i++)
- {
- aarch64_set_vec_u64 (cpu, vd, 0, val1);
- aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
- }
- break;
- }
+ /* Compute the lane number first (using size), and then compute size. */
+ LDn_STn_SINGLE_LANE_AND_SIZE ();
- default:
- HALT_UNALLOC;
+ for (i = 0; i < nregs; i++)
+ switch (size)
+ {
+ case 0:
+ {
+ uint8_t val = aarch64_get_vec_u8 (cpu, vd + i, lane);
+ aarch64_set_mem_u8 (cpu, address + i, val);
+ break;
}
- break;
- case 2: /* LD3R. */
- switch (size)
+ case 1:
{
- case 0:
- {
- uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
- uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
- uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
-
- for (i = 0; i < (full ? 16 : 8); i++)
- {
- aarch64_set_vec_u8 (cpu, vd, 0, val1);
- aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u8 (cpu, vd + 2, 0, val3);
- }
- }
+ uint16_t val = aarch64_get_vec_u16 (cpu, vd + i, lane);
+ aarch64_set_mem_u16 (cpu, address + (i * 2), val);
break;
+ }
- case 1:
- {
- uint32_t val1 = aarch64_get_mem_u16 (cpu, address);
- uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
- uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
-
- for (i = 0; i < (full ? 8 : 4); i++)
- {
- aarch64_set_vec_u16 (cpu, vd, 0, val1);
- aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u16 (cpu, vd + 2, 0, val3);
- }
- }
+ case 2:
+ {
+ uint32_t val = aarch64_get_vec_u32 (cpu, vd + i, lane);
+ aarch64_set_mem_u32 (cpu, address + (i * 4), val);
break;
+ }
- case 2:
- {
- uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
- uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
- uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
-
- for (i = 0; i < (full ? 4 : 2); i++)
- {
- aarch64_set_vec_u32 (cpu, vd, 0, val1);
- aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u32 (cpu, vd + 2, 0, val3);
- }
- }
+ case 3:
+ {
+ uint64_t val = aarch64_get_vec_u64 (cpu, vd + i, lane);
+ aarch64_set_mem_u64 (cpu, address + (i * 8), val);
break;
+ }
+ }
+}
- case 3:
- {
- uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
- uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
- uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
+/* Load single structure into all lanes of N registers. */
+static void
+do_vec_LDnR (sim_cpu *cpu, uint64_t address)
+{
+ /* instr[31] = 0
+ instr[30] = element selector 0=>half, 1=>all elements
+ instr[29,24] = 00 1101
+ instr[23] = 0=>simple, 1=>post
+ instr[22] = 1
+ instr[21] = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
+ instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
+ 11111 (immediate post inc)
+ instr[15,14] = 11
+ instr[13] = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
+ instr[12] = 0
+ instr[11,10] = element size 00=> byte(b), 01=> half(h),
+ 10=> word(s), 11=> double(d)
+ instr[9,5] = address
+ instr[4,0] = Vd */
- for (i = 0; i < (full ? 2 : 1); i++)
- {
- aarch64_set_vec_u64 (cpu, vd, 0, val1);
- aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u64 (cpu, vd + 2, 0, val3);
- }
- }
- break;
+ unsigned full = INSTR (30, 30);
+ unsigned vd = INSTR (4, 0);
+ unsigned size = INSTR (11, 10);
+ int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+ int i, n;
- default:
- HALT_UNALLOC;
- }
- break;
+ NYI_assert (29, 24, 0x0D);
+ NYI_assert (22, 22, 1);
+ NYI_assert (15, 14, 3);
+ NYI_assert (12, 12, 0);
- case 3: /* LD4R. */
- switch (size)
+ for (n = 0; n < nregs; n++)
+ switch (size)
+ {
+ case 0:
{
- case 0:
- {
- uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
- uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
- uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
- uint8_t val4 = aarch64_get_mem_u8 (cpu, address + 3);
-
- for (i = 0; i < (full ? 16 : 8); i++)
- {
- aarch64_set_vec_u8 (cpu, vd, 0, val1);
- aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u8 (cpu, vd + 2, 0, val3);
- aarch64_set_vec_u8 (cpu, vd + 3, 0, val4);
- }
- }
+ uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
+ for (i = 0; i < (full ? 16 : 8); i++)
+ aarch64_set_vec_u8 (cpu, vd + n, i, val);
break;
+ }
- case 1:
- {
- uint32_t val1 = aarch64_get_mem_u16 (cpu, address);
- uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
- uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
- uint32_t val4 = aarch64_get_mem_u16 (cpu, address + 6);
-
- for (i = 0; i < (full ? 8 : 4); i++)
- {
- aarch64_set_vec_u16 (cpu, vd, 0, val1);
- aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u16 (cpu, vd + 2, 0, val3);
- aarch64_set_vec_u16 (cpu, vd + 3, 0, val4);
- }
- }
+ case 1:
+ {
+ uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
+ for (i = 0; i < (full ? 8 : 4); i++)
+ aarch64_set_vec_u16 (cpu, vd + n, i, val);
break;
+ }
- case 2:
- {
- uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
- uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
- uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
- uint32_t val4 = aarch64_get_mem_u32 (cpu, address + 12);
-
- for (i = 0; i < (full ? 4 : 2); i++)
- {
- aarch64_set_vec_u32 (cpu, vd, 0, val1);
- aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u32 (cpu, vd + 2, 0, val3);
- aarch64_set_vec_u32 (cpu, vd + 3, 0, val4);
- }
- }
+ case 2:
+ {
+ uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
+ for (i = 0; i < (full ? 4 : 2); i++)
+ aarch64_set_vec_u32 (cpu, vd + n, i, val);
break;
+ }
- case 3:
- {
- uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
- uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
- uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
- uint64_t val4 = aarch64_get_mem_u64 (cpu, address + 24);
-
- for (i = 0; i < (full ? 2 : 1); i++)
- {
- aarch64_set_vec_u64 (cpu, vd, 0, val1);
- aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u64 (cpu, vd + 2, 0, val3);
- aarch64_set_vec_u64 (cpu, vd + 3, 0, val4);
- }
- }
+ case 3:
+ {
+ uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
+ for (i = 0; i < (full ? 2 : 1); i++)
+ aarch64_set_vec_u64 (cpu, vd + n, i, val);
break;
-
- default:
- HALT_UNALLOC;
}
- break;
- default:
- HALT_UNALLOC;
- }
+ default:
+ HALT_UNALLOC;
+ }
}
static void
@@ -11848,7 +11810,7 @@ do_vec_load_store (sim_cpu *cpu)
instr[31] = 0
instr[30] = element selector 0=>half, 1=>all elements
instr[29,25] = 00110
- instr[24] = ?
+ instr[24] = 0=>multiple struct, 1=>single struct
instr[23] = 0=>simple, 1=>post
instr[22] = 0=>store, 1=>load
instr[21] = 0 (LDn) / small(0)-large(1) selector (LDnR)
@@ -11876,6 +11838,7 @@ do_vec_load_store (sim_cpu *cpu)
instr[9,5] = Vn, can be SP
instr[4,0] = Vd */
+ int single;
int post;
int load;
unsigned vn;
@@ -11885,15 +11848,16 @@ do_vec_load_store (sim_cpu *cpu)
if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
HALT_NYI;
- type = INSTR (15, 12);
- if (type != 0xE && type != 0xE && INSTR (21, 21) != 0)
- HALT_NYI;
-
+ single = INSTR (24, 24);
post = INSTR (23, 23);
load = INSTR (22, 22);
+ type = INSTR (15, 12);
vn = INSTR (9, 5);
address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
+ if (! single && INSTR (21, 21) != 0)
+ HALT_UNALLOC;
+
if (post)
{
unsigned vm = INSTR (20, 16);
@@ -11902,49 +11866,78 @@ do_vec_load_store (sim_cpu *cpu)
{
unsigned sizeof_operation;
- switch (type)
+ if (single)
{
- case 0: sizeof_operation = 32; break;
- case 4: sizeof_operation = 24; break;
- case 8: sizeof_operation = 16; break;
-
- case 0xC:
- sizeof_operation = INSTR (21, 21) ? 2 : 1;
- sizeof_operation <<= INSTR (11, 10);
- break;
+ if ((type >= 0) && (type <= 11))
+ {
+ int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+ switch (INSTR (15, 14))
+ {
+ case 0:
+ sizeof_operation = nregs * 1;
+ break;
+ case 1:
+ sizeof_operation = nregs * 2;
+ break;
+ case 2:
+ if (INSTR (10, 10) == 0)
+ sizeof_operation = nregs * 4;
+ else
+ sizeof_operation = nregs * 8;
+ break;
+ default:
+ HALT_UNALLOC;
+ }
+ }
+ else if (type == 0xC)
+ {
+ sizeof_operation = INSTR (21, 21) ? 2 : 1;
+ sizeof_operation <<= INSTR (11, 10);
+ }
+ else if (type == 0xE)
+ {
+ sizeof_operation = INSTR (21, 21) ? 4 : 3;
+ sizeof_operation <<= INSTR (11, 10);
+ }
+ else
+ HALT_UNALLOC;
+ }
+ else
+ {
+ switch (type)
+ {
+ case 0: sizeof_operation = 32; break;
+ case 4: sizeof_operation = 24; break;
+ case 8: sizeof_operation = 16; break;
- case 0xE:
- sizeof_operation = INSTR (21, 21) ? 8 : 4;
- sizeof_operation <<= INSTR (11, 10);
- break;
+ case 7:
+ /* One register, immediate offset variant. */
+ sizeof_operation = 8;
+ break;
- case 7:
- /* One register, immediate offset variant. */
- sizeof_operation = 8;
- break;
+ case 10:
+ /* Two registers, immediate offset variant. */
+ sizeof_operation = 16;
+ break;
- case 10:
- /* Two registers, immediate offset variant. */
- sizeof_operation = 16;
- break;
+ case 6:
+ /* Three registers, immediate offset variant. */
+ sizeof_operation = 24;
+ break;
- case 6:
- /* Three registers, immediate offset variant. */
- sizeof_operation = 24;
- break;
+ case 2:
+ /* Four registers, immediate offset variant. */
+ sizeof_operation = 32;
+ break;
- case 2:
- /* Four registers, immediate offset variant. */
- sizeof_operation = 32;
- break;
+ default:
+ HALT_UNALLOC;
+ }
- default:
- HALT_UNALLOC;
+ if (INSTR (30, 30))
+ sizeof_operation *= 2;
}
- if (INSTR (30, 30))
- sizeof_operation *= 2;
-
aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
}
else
@@ -11956,6 +11949,29 @@ do_vec_load_store (sim_cpu *cpu)
NYI_assert (20, 16, 0);
}
+ if (single)
+ {
+ if (load)
+ {
+ if ((type >= 0) && (type <= 11))
+ do_vec_LDn_single (cpu, address);
+ else if ((type == 0xC) || (type == 0xE))
+ do_vec_LDnR (cpu, address);
+ else
+ HALT_UNALLOC;
+ return;
+ }
+
+ /* Stores. */
+ if ((type >= 0) && (type <= 11))
+ {
+ do_vec_STn_single (cpu, address);
+ return;
+ }
+
+ HALT_UNALLOC;
+ }
+
if (load)
{
switch (type)
@@ -11968,11 +11984,8 @@ do_vec_load_store (sim_cpu *cpu)
case 10: LD1_2 (cpu, address); return;
case 7: LD1_1 (cpu, address); return;
- case 0xE:
- case 0xC: do_vec_LDnR (cpu, address); return;
-
default:
- HALT_NYI;
+ HALT_UNALLOC;
}
}
@@ -11987,7 +12000,7 @@ do_vec_load_store (sim_cpu *cpu)
case 10: ST1_2 (cpu, address); return;
case 7: ST1_1 (cpu, address); return;
default:
- HALT_NYI;
+ HALT_UNALLOC;
}
}
diff --git a/sim/testsuite/sim/aarch64/ChangeLog b/sim/testsuite/sim/aarch64/ChangeLog
index 6a39354..86940e2 100644
--- a/sim/testsuite/sim/aarch64/ChangeLog
+++ b/sim/testsuite/sim/aarch64/ChangeLog
@@ -1,3 +1,9 @@
+2017-02-14 Jim Wilson <jim.wilson@linaro.org>
+
+ * ldn_single.s: New.
+ * ldnr.s: New.
+ * stn_single.s: New.
+
2017-01-23 Jim Wilson <jim.wilson@linaro.org>
* cmtst.s: New.
diff --git a/sim/testsuite/sim/aarch64/ldn_single.s b/sim/testsuite/sim/aarch64/ldn_single.s
new file mode 100644
index 0000000..3102e9e
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/ldn_single.s
@@ -0,0 +1,100 @@
+# mach: aarch64
+
+# Check the load single 1-element structure to one lane instructions:
+# ld1, ld2, ld3, ld4.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+input:
+ .word 0x04030201
+ .word 0x08070605
+ .word 0x0c0b0a09
+ .word 0x100f0e0d
+ .word 0x14131211
+ .word 0x18171615
+ .word 0x1c1b1a19
+ .word 0x201f1e1d
+
+ start
+ adrp x0, input
+ add x0, x0, :lo12:input
+
+ mov x2, x0
+ mov x3, #1
+ mov x4, #4
+ ld1 {v0.b}[0], [x2], 1
+ ld1 {v0.b}[1], [x2], x3
+ ld1 {v0.h}[1], [x2], 2
+ ld1 {v0.s}[1], [x2], x4
+ ld1 {v0.d}[1], [x2]
+ addv b1, v0.16b
+ mov x5, v1.d[0]
+ cmp x5, #136
+ bne .Lfailure
+
+ mov x2, x0
+ mov x3, #16
+ mov x4, #4
+ ld2 {v0.d, v1.d}[0], [x2], x3
+ ld2 {v0.s, v1.s}[2], [x2], 8
+ ld2 {v0.h, v1.h}[6], [x2], x4
+ ld2 {v0.b, v1.b}[14], [x2], 2
+ ld2 {v0.b, v1.b}[15], [x2]
+ addv b2, v0.16b
+ addv b3, v1.16b
+ mov x5, v2.d[0]
+ mov x6, v3.d[0]
+ cmp x5, #221
+ bne .Lfailure
+ cmp x6, #307
+ bne .Lfailure
+
+ mov x2, x0
+ ld3 {v0.s, v1.s, v2.s}[0], [x2], 12
+ ld3 {v0.s, v1.s, v2.s}[1], [x2]
+ mov x2, x0
+ mov x3, #12
+ ld3 {v0.s, v1.s, v2.s}[2], [x2], x3
+ ld3 {v0.s, v1.s, v2.s}[3], [x2]
+ addv b3, v0.16b
+ addv b4, v1.16b
+ addv b5, v2.16b
+ mov x4, v3.d[0]
+ mov x5, v4.d[0]
+ mov x6, v5.d[0]
+ cmp x4, #136
+ bne .Lfailure
+ cmp x5, #200
+ bne .Lfailure
+ cmp x6, #264
+ bne .Lfailure
+
+ mov x2, x0
+ ld4 {v0.s, v1.s, v2.s, v3.s}[0], [x2], 16
+ ld4 {v0.s, v1.s, v2.s, v3.s}[1], [x2]
+ mov x2, x0
+ mov x3, #16
+ ld4 {v0.s, v1.s, v2.s, v3.s}[2], [x2], x3
+ ld4 {v0.s, v1.s, v2.s, v3.s}[3], [x2]
+ addv b4, v0.16b
+ addv b5, v1.16b
+ addv b6, v2.16b
+ addv b7, v3.16b
+ mov x4, v4.d[0]
+ mov x5, v5.d[0]
+ mov x6, v6.d[0]
+ mov x7, v7.d[0]
+ cmp x4, #168
+ bne .Lfailure
+ cmp x5, #232
+ bne .Lfailure
+ cmp x6, #296
+ bne .Lfailure
+ cmp x7, #360
+ bne .Lfailure
+
+ pass
+.Lfailure:
+ fail
diff --git a/sim/testsuite/sim/aarch64/ldnr.s b/sim/testsuite/sim/aarch64/ldnr.s
new file mode 100644
index 0000000..a4bfffa
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/ldnr.s
@@ -0,0 +1,176 @@
+# mach: aarch64
+
+# Check the load single 1-element structure and replicate to all lanes insns:
+# ld1r, ld2r, ld3r, ld4r.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+input:
+ .word 0x04030201
+ .word 0x08070605
+ .word 0x0c0b0a09
+ .word 0x100f0e0d
+input2:
+ .word 0x00000001
+ .word 0x00000002
+ .word 0x00000003
+ .word 0x00000004
+ .word 0x00000005
+ .word 0x00000006
+ .word 0x00000007
+ .word 0x00000008
+ .word 0x00000009
+ .word 0x0000000a
+ .word 0x0000000b
+ .word 0x0000000c
+
+ start
+ adrp x0, input
+ add x0, x0, :lo12:input
+ adrp x1, input2
+ add x1, x1, :lo12:input2
+
+ mov x2, x0
+ mov x3, #1
+ ld1r {v0.8b}, [x2], 1
+ ld1r {v1.16b}, [x2], x3
+ ld1r {v2.4h}, [x2], 2
+ ld1r {v3.8h}, [x2]
+ addv b0, v0.8b
+ addv b1, v1.16b
+ addv b2, v2.8b
+ addv b3, v3.16b
+ mov x2, v0.d[0]
+ mov x3, v1.d[0]
+ mov x4, v2.d[0]
+ mov x5, v3.d[0]
+ cmp x2, #8
+ bne .Lfailure
+ cmp x3, #32
+ bne .Lfailure
+ cmp x4, #28
+ bne .Lfailure
+ cmp x5, #88
+ bne .Lfailure
+
+ mov x2, x1
+ mov x3, #8
+ ld2r {v0.2s, v1.2s}, [x2], 8
+ ld2r {v2.4s, v3.4s}, [x2], x3
+ ld2r {v4.1d, v5.1d}, [x2], 16
+ ld2r {v6.2d, v7.2d}, [x2]
+ addp v0.2s, v0.2s, v1.2s
+ addv s2, v2.4s
+ addv s3, v3.4s
+ addp v4.2s, v4.2s, v5.2s
+ addv s6, v6.4s
+ addv s7, v7.4s
+ mov w2, v0.s[0]
+ mov w3, v0.s[1]
+ mov x4, v2.d[0]
+ mov x5, v3.d[0]
+ mov w6, v4.s[0]
+ mov w7, v4.s[1]
+ mov x8, v6.d[0]
+ mov x9, v7.d[0]
+ cmp w2, #2
+ bne .Lfailure
+ cmp w3, #4
+ bne .Lfailure
+ cmp x4, #12
+ bne .Lfailure
+ cmp x5, #16
+ bne .Lfailure
+ cmp w6, #11
+ bne .Lfailure
+ cmp w7, #15
+ bne .Lfailure
+ cmp x8, #38
+ bne .Lfailure
+ cmp x9, #46
+ bne .Lfailure
+
+ mov x2, x0
+ mov x3, #3
+ ld3r {v0.8b, v1.8b, v2.8b}, [x2], 3
+ ld3r {v3.8b, v4.8b, v5.8b}, [x2], x3
+ ld3r {v6.8b, v7.8b, v8.8b}, [x2]
+ addv b0, v0.8b
+ addv b1, v1.8b
+ addv b2, v2.8b
+ addv b3, v3.8b
+ addv b4, v4.8b
+ addv b5, v5.8b
+ addv b6, v6.8b
+ addv b7, v7.8b
+ addv b8, v8.8b
+ addv b9, v9.8b
+ mov x2, v0.d[0]
+ mov x3, v1.d[0]
+ mov x4, v2.d[0]
+ mov x5, v3.d[0]
+ mov x6, v4.d[0]
+ mov x7, v5.d[0]
+ mov x8, v6.d[0]
+ mov x9, v7.d[0]
+ mov x10, v8.d[0]
+ cmp x2, #8
+ bne .Lfailure
+ cmp x3, #16
+ bne .Lfailure
+ cmp x4, #24
+ bne .Lfailure
+ cmp x5, #32
+ bne .Lfailure
+ cmp x6, #40
+ bne .Lfailure
+ cmp x7, #48
+ bne .Lfailure
+ cmp x8, #56
+ bne .Lfailure
+ cmp x9, #64
+ bne .Lfailure
+ cmp x10, #72
+ bne .Lfailure
+
+ mov x2, x1
+ ld4r {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], 16
+ ld4r {v4.4s, v5.4s, v6.4s, v7.4s}, [x2]
+ addv s0, v0.4s
+ addv s1, v1.4s
+ addv s2, v2.4s
+ addv s3, v3.4s
+ addv s4, v4.4s
+ addv s5, v5.4s
+ addv s6, v6.4s
+ addv s7, v7.4s
+ mov x2, v0.d[0]
+ mov x3, v1.d[0]
+ mov x4, v2.d[0]
+ mov x5, v3.d[0]
+ mov x6, v4.d[0]
+ mov x7, v5.d[0]
+ mov x8, v6.d[0]
+ mov x9, v7.d[0]
+ cmp x2, #4
+ bne .Lfailure
+ cmp x3, #8
+ bne .Lfailure
+ cmp x4, #12
+ bne .Lfailure
+ cmp x5, #16
+ bne .Lfailure
+ cmp x6, #20
+ bne .Lfailure
+ cmp x7, #24
+ bne .Lfailure
+ cmp x8, #28
+ bne .Lfailure
+ cmp x9, #32
+ bne .Lfailure
+
+ pass
+.Lfailure:
+ fail
diff --git a/sim/testsuite/sim/aarch64/stn_single.s b/sim/testsuite/sim/aarch64/stn_single.s
new file mode 100644
index 0000000..5527c84
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/stn_single.s
@@ -0,0 +1,122 @@
+# mach: aarch64
+
+# Check the store single 1-element structure from one lane instructions:
+# st1, st2, st3, st4.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+input:
+ .word 0x04030201
+ .word 0x08070605
+ .word 0x0c0b0a09
+ .word 0x100f0e0d
+ .word 0x14131211
+ .word 0x18171615
+ .word 0x1c1b1a19
+ .word 0x201f1e1d
+output:
+ .zero 64
+
+ start
+ adrp x0, input
+ add x0, x0, :lo12:input
+ adrp x1, output
+ add x1, x1, :lo12:output
+
+ mov x2, x0
+ ldr q0, [x2], 8
+ ldr q1, [x2]
+ mov x2, x0
+ ldr q2, [x2], 8
+ ldr q3, [x2]
+
+ mov x2, x1
+ mov x3, #1
+ mov x4, #4
+ st1 {v0.b}[0], [x2], 1
+ st1 {v0.b}[1], [x2], x3
+ st1 {v0.h}[1], [x2], 2
+ st1 {v0.s}[1], [x2], x4
+ st1 {v0.d}[1], [x2]
+ ldr q4, [x1]
+ addv b4, v4.16b
+ mov x5, v4.d[0]
+ cmp x5, #136
+ bne .Lfailure
+
+ mov x2, x1
+ mov x3, #16
+ mov x4, #4
+ st2 {v0.d, v1.d}[0], [x2], x3
+ st2 {v0.s, v1.s}[2], [x2], 8
+ st2 {v0.h, v1.h}[6], [x2], x4
+ st2 {v0.b, v1.b}[14], [x2], 2
+ st2 {v0.b, v1.b}[15], [x2]
+ mov x2, x1
+ ldr q4, [x2], 16
+ ldr q5, [x2]
+ addv b4, v4.16b
+ addv b5, v5.16b
+ mov x5, v4.d[0]
+ mov x6, v5.d[0]
+ cmp x5, #136
+ bne .Lfailure
+ cmp x6, #264
+ bne .Lfailure
+
+ mov x2, x1
+ mov x3, #12
+ st3 {v0.s, v1.s, v2.s}[0], [x2], 12
+ st3 {v0.s, v1.s, v2.s}[1], [x2], x3
+ st3 {v0.s, v1.s, v2.s}[2], [x2], 12
+ st3 {v0.s, v1.s, v2.s}[3], [x2]
+ mov x2, x1
+ ldr q4, [x2], 16
+ ldr q5, [x2], 16
+ ldr q6, [x2]
+ addv b4, v4.16b
+ addv b5, v5.16b
+ addv b6, v6.16b
+ mov x4, v4.d[0]
+ mov x5, v5.d[0]
+ mov x6, v6.d[0]
+ cmp x4, #88
+ bne .Lfailure
+ cmp x5, #200
+ bne .Lfailure
+ cmp x6, #248
+ bne .Lfailure
+
+ mov x2, x1
+ mov x3, #16
+ st4 {v0.s, v1.s, v2.s, v3.s}[0], [x2], 16
+ st4 {v0.s, v1.s, v2.s, v3.s}[1], [x2], x3
+ st4 {v0.s, v1.s, v2.s, v3.s}[2], [x2], 16
+ st4 {v0.s, v1.s, v2.s, v3.s}[3], [x2]
+ mov x2, x1
+ ldr q4, [x2], 16
+ ldr q5, [x2], 16
+ ldr q6, [x2], 16
+ ldr q7, [x2]
+ addv b4, v4.16b
+ addv b5, v5.16b
+ addv b6, v6.16b
+ addv b7, v7.16b
+ mov x4, v4.d[0]
+ mov x5, v5.d[0]
+ mov x6, v6.d[0]
+ mov x7, v7.d[0]
+ cmp x4, #104
+ bne .Lfailure
+ cmp x5, #168
+ bne .Lfailure
+ cmp x6, #232
+ bne .Lfailure
+ cmp x7, #296
+ bne .Lfailure
+
+ pass
+.Lfailure:
+ fail