 sim/aarch64/ChangeLog                  |  12
 sim/aarch64/simulator.c                | 551
 sim/testsuite/sim/aarch64/ChangeLog    |   6
 sim/testsuite/sim/aarch64/ldn_single.s | 100
 sim/testsuite/sim/aarch64/ldnr.s       | 176
 sim/testsuite/sim/aarch64/stn_single.s | 122
 6 files changed, 698 insertions(+), 269 deletions(-)
diff --git a/sim/aarch64/ChangeLog b/sim/aarch64/ChangeLog
index f9a62e7..2a21fc3 100644
--- a/sim/aarch64/ChangeLog
+++ b/sim/aarch64/ChangeLog
@@ -1,3 +1,15 @@
+2017-02-14  Jim Wilson  <jim.wilson@linaro.org>
+
+	* simulator.c (LDn_STn_SINGLE_LANE_AND_SIZE): New.
+	(do_vec_LDn_single, do_vec_STn_single): New.
+	(do_vec_LDnR): Add and set new nregs var.  Replace switch on nregs
+	with loop over nregs using new var n.  Add n times size to address
+	in loop.  Add n to vd in loop.
+	(do_vec_load_store): Add comment for instruction bit 24.  New var
+	single to hold instruction bit 24.  Add new code to use single.
+	Move ldnr support inside single if statements.  Fix ldnr register
+	counts inside post if statement.  Change HALT_NYI calls to
+	HALT_UNALLOC.
+
 2017-01-23  Jim Wilson  <jim.wilson@linaro.org>
 
 	* simulator.c (do_vec_compare): Add case 0x23 for CMTST.
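The heart of the patch is the lane/size decode performed by the new
LDn_STn_SINGLE_LANE_AND_SIZE macro below: instr[15,14] selects the element
width, and the lane number is reassembled from the Q (instr[30]),
S (instr[12]) and size (instr[11,10]) fields.  As a standalone paraphrase
of that decode -- a sketch with invented names, not the simulator's API --
the same logic can be written as a plain function:

    #include <stdio.h>

    /* Returns 0 and sets *lane/*size on success, -1 for an unallocated
       encoding.  q = instr[30], opcode = instr[15,14], s = instr[12],
       size_in = instr[11,10].  *size is log2 of the element byte width.  */
    static int
    decode_lane_and_size (int q, int opcode, int s, int size_in,
                          int *lane, int *size)
    {
      switch (opcode)
        {
        case 0:                         /* B: lane is Q:S:size.  */
          *lane = (q << 3) | (s << 2) | size_in;
          *size = 0;
          return 0;
        case 1:                         /* H: size<0> must be 0.  */
          if (size_in & 1)
            return -1;
          *lane = (q << 2) | (s << 1) | (size_in >> 1);
          *size = 1;
          return 0;
        case 2:
          if (size_in & 2)
            return -1;
          if ((size_in & 1) == 0)       /* S: lane is Q:S.  */
            {
              *lane = (q << 1) | s;
              *size = 2;
            }
          else                          /* D: S must be 0, lane is Q.  */
            {
              if (s)
                return -1;
              *lane = q;
              *size = 3;
            }
          return 0;
        default:
          return -1;
        }
    }

    int
    main (void)
    {
      int lane, size;
      /* ld2 {v0.s, v1.s}[2] encodes Q=1, opcode=2, S=0, size=00,
         so lane = (1 << 1) | 0 = 2 and elements are 4 bytes wide.  */
      if (decode_lane_and_size (1, 2, 0, 0, &lane, &size) == 0)
        printf ("lane %d, %d-byte elements\n", lane, 1 << size);
      return 0;
    }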
diff --git a/sim/aarch64/simulator.c b/sim/aarch64/simulator.c
index a44e70a..403edb7 100644
--- a/sim/aarch64/simulator.c
+++ b/sim/aarch64/simulator.c
@@ -11560,284 +11560,246 @@ ST1_4 (sim_cpu *cpu, uint64_t address)
   vec_store (cpu, address, 4);
 }
 
+#define LDn_STn_SINGLE_LANE_AND_SIZE()			\
+  do							\
+    {							\
+      switch (INSTR (15, 14))				\
+	{						\
+	case 0:						\
+	  lane = (full << 3) | (s << 2) | size;		\
+	  size = 0;					\
+	  break;					\
+							\
+	case 1:						\
+	  if ((size & 1) == 1)				\
+	    HALT_UNALLOC;				\
+	  lane = (full << 2) | (s << 1) | (size >> 1);	\
+	  size = 1;					\
+	  break;					\
+							\
+	case 2:						\
+	  if ((size & 2) == 2)				\
+	    HALT_UNALLOC;				\
+							\
+	  if ((size & 1) == 0)				\
+	    {						\
+	      lane = (full << 1) | s;			\
+	      size = 2;					\
+	    }						\
+	  else						\
+	    {						\
+	      if (s)					\
+		HALT_UNALLOC;				\
+	      lane = full;				\
+	      size = 3;					\
+	    }						\
+	  break;					\
+							\
+	default:					\
+	  HALT_UNALLOC;					\
+	}						\
+    }							\
+  while (0)
+
+/* Load single structure into one lane of N registers.  */
 static void
-do_vec_LDnR (sim_cpu *cpu, uint64_t address)
+do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
 {
   /* instr[31]    = 0
      instr[30]    = element selector 0=>half, 1=>all elements
      instr[29,24] = 00 1101
      instr[23]    = 0=>simple, 1=>post
      instr[22]    = 1
-     instr[21]    = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
+     instr[21]    = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
                     11111 (immediate post inc)
-     instr[15,14] = 11
-     instr[13]    = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
-     instr[12]    = 0
-     instr[11,10] = element size 00=> byte(b), 01=> half(h),
-                    10=> word(s), 11=> double(d)
+     instr[15,13] = opcode
+     instr[12]    = S, used for lane number
+     instr[11,10] = size, also used for lane number
      instr[9,5]   = address
      instr[4,0]   = Vd  */
 
   unsigned full = INSTR (30, 30);
   unsigned vd = INSTR (4, 0);
   unsigned size = INSTR (11, 10);
+  unsigned s = INSTR (12, 12);
+  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+  int lane = 0;
   int i;
 
   NYI_assert (29, 24, 0x0D);
   NYI_assert (22, 22, 1);
-  NYI_assert (15, 14, 3);
-  NYI_assert (12, 12, 0);
 
-  switch ((INSTR (13, 13) << 1) | INSTR (21, 21))
-    {
-    case 0: /* LD1R.  */
-      switch (size)
-	{
-	case 0:
-	  {
-	    uint8_t val = aarch64_get_mem_u8 (cpu, address);
-	    for (i = 0; i < (full ? 16 : 8); i++)
-	      aarch64_set_vec_u8 (cpu, vd, i, val);
-	    break;
-	  }
-
-	case 1:
-	  {
-	    uint16_t val = aarch64_get_mem_u16 (cpu, address);
-	    for (i = 0; i < (full ? 8 : 4); i++)
-	      aarch64_set_vec_u16 (cpu, vd, i, val);
-	    break;
-	  }
-
-	case 2:
-	  {
-	    uint32_t val = aarch64_get_mem_u32 (cpu, address);
-	    for (i = 0; i < (full ? 4 : 2); i++)
-	      aarch64_set_vec_u32 (cpu, vd, i, val);
-	    break;
-	  }
-
-	case 3:
-	  {
-	    uint64_t val = aarch64_get_mem_u64 (cpu, address);
-	    for (i = 0; i < (full ? 2 : 1); i++)
-	      aarch64_set_vec_u64 (cpu, vd, i, val);
-	    break;
-	  }
+  /* Compute the lane number first (using size), and then compute size.  */
+  LDn_STn_SINGLE_LANE_AND_SIZE ();
 
-	default:
-	  HALT_UNALLOC;
+  for (i = 0; i < nregs; i++)
+    switch (size)
+      {
+      case 0:
+	{
+	  uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
+	  aarch64_set_vec_u8 (cpu, vd + i, lane, val);
+	  break;
 	}
-      break;
 
-    case 1: /* LD2R.  */
-      switch (size)
+      case 1:
 	{
-	case 0:
-	  {
-	    uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
-	    uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
-
-	    for (i = 0; i < (full ? 16 : 8); i++)
-	      {
-		aarch64_set_vec_u8 (cpu, vd, 0, val1);
-		aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
-	      }
-	    break;
-	  }
+	  uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
+	  aarch64_set_vec_u16 (cpu, vd + i, lane, val);
+	  break;
+	}
 
-	case 1:
-	  {
-	    uint16_t val1 = aarch64_get_mem_u16 (cpu, address);
-	    uint16_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
-
-	    for (i = 0; i < (full ? 8 : 4); i++)
-	      {
-		aarch64_set_vec_u16 (cpu, vd, 0, val1);
-		aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
-	      }
-	    break;
-	  }
+      case 2:
+	{
+	  uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
+	  aarch64_set_vec_u32 (cpu, vd + i, lane, val);
+	  break;
+	}
 
-	case 2:
-	  {
-	    uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
-	    uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
-
-	    for (i = 0; i < (full ? 4 : 2); i++)
-	      {
-		aarch64_set_vec_u32 (cpu, vd, 0, val1);
-		aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
-	      }
-	    break;
-	  }
+      case 3:
+	{
+	  uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
+	  aarch64_set_vec_u64 (cpu, vd + i, lane, val);
+	  break;
+	}
+      }
+}
 
-	case 3:
-	  {
-	    uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
-	    uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
-
-	    for (i = 0; i < (full ? 2 : 1); i++)
-	      {
-		aarch64_set_vec_u64 (cpu, vd, 0, val1);
-		aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
-	      }
-	    break;
-	  }
+/* Store single structure from one lane from N registers.  */
+static void
+do_vec_STn_single (sim_cpu *cpu, uint64_t address)
+{
+  /* instr[31]    = 0
+     instr[30]    = element selector 0=>half, 1=>all elements
+     instr[29,24] = 00 1101
+     instr[23]    = 0=>simple, 1=>post
+     instr[22]    = 0
+     instr[21]    = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
+     instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
+                    11111 (immediate post inc)
+     instr[15,13] = opcode
+     instr[12]    = S, used for lane number
+     instr[11,10] = size, also used for lane number
+     instr[9,5]   = address
+     instr[4,0]   = Vd  */
 
-	default:
-	  HALT_UNALLOC;
-	}
-      break;
+  unsigned full = INSTR (30, 30);
+  unsigned vd = INSTR (4, 0);
+  unsigned size = INSTR (11, 10);
+  unsigned s = INSTR (12, 12);
+  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+  int lane = 0;
+  int i;
 
-    case 2: /* LD3R.  */
-      switch (size)
-	{
-	case 0:
-	  {
-	    uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
-	    uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
-	    uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
-
-	    for (i = 0; i < (full ? 16 : 8); i++)
-	      {
-		aarch64_set_vec_u8 (cpu, vd, 0, val1);
-		aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
-		aarch64_set_vec_u8 (cpu, vd + 2, 0, val3);
-	      }
-	  }
-	  break;
+  NYI_assert (29, 24, 0x0D);
+  NYI_assert (22, 22, 0);
 
-	case 1:
-	  {
-	    uint32_t val1 = aarch64_get_mem_u16 (cpu, address);
-	    uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
-	    uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
-
-	    for (i = 0; i < (full ? 8 : 4); i++)
-	      {
-		aarch64_set_vec_u16 (cpu, vd, 0, val1);
-		aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
-		aarch64_set_vec_u16 (cpu, vd + 2, 0, val3);
-	      }
-	  }
-	  break;
+  /* Compute the lane number first (using size), and then compute size.  */
+  LDn_STn_SINGLE_LANE_AND_SIZE ();
 
-	case 2:
-	  {
-	    uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
-	    uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
-	    uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
-
-	    for (i = 0; i < (full ? 4 : 2); i++)
-	      {
-		aarch64_set_vec_u32 (cpu, vd, 0, val1);
-		aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
-		aarch64_set_vec_u32 (cpu, vd + 2, 0, val3);
-	      }
-	  }
-	  break;
+  for (i = 0; i < nregs; i++)
+    switch (size)
+      {
+      case 0:
+	{
+	  uint8_t val = aarch64_get_vec_u8 (cpu, vd + i, lane);
+	  aarch64_set_mem_u8 (cpu, address + i, val);
+	  break;
+	}
 
-	case 3:
-	  {
-	    uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
-	    uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
-	    uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
-
-	    for (i = 0; i < (full ? 2 : 1); i++)
-	      {
-		aarch64_set_vec_u64 (cpu, vd, 0, val1);
-		aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
-		aarch64_set_vec_u64 (cpu, vd + 2, 0, val3);
-	      }
-	  }
-	  break;
+      case 1:
+	{
+	  uint16_t val = aarch64_get_vec_u16 (cpu, vd + i, lane);
+	  aarch64_set_mem_u16 (cpu, address + (i * 2), val);
+	  break;
+	}
 
-	default:
-	  HALT_UNALLOC;
-	}
-      break;
+      case 2:
+	{
+	  uint32_t val = aarch64_get_vec_u32 (cpu, vd + i, lane);
+	  aarch64_set_mem_u32 (cpu, address + (i * 4), val);
+	  break;
+	}
 
-    case 3: /* LD4R.  */
-      switch (size)
+      case 3:
 	{
-	case 0:
-	  {
-	    uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
-	    uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
-	    uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
-	    uint8_t val4 = aarch64_get_mem_u8 (cpu, address + 3);
-
-	    for (i = 0; i < (full ? 16 : 8); i++)
-	      {
-		aarch64_set_vec_u8 (cpu, vd, 0, val1);
-		aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
-		aarch64_set_vec_u8 (cpu, vd + 2, 0, val3);
-		aarch64_set_vec_u8 (cpu, vd + 3, 0, val4);
-	      }
-	  }
+	  uint64_t val = aarch64_get_vec_u64 (cpu, vd + i, lane);
+	  aarch64_set_mem_u64 (cpu, address + (i * 8), val);
 	  break;
+	}
+      }
+}
 
-	case 1:
-	  {
-	    uint32_t val1 = aarch64_get_mem_u16 (cpu, address);
-	    uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
-	    uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
-	    uint32_t val4 = aarch64_get_mem_u16 (cpu, address + 6);
-
-	    for (i = 0; i < (full ? 8 : 4); i++)
-	      {
-		aarch64_set_vec_u16 (cpu, vd, 0, val1);
-		aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
-		aarch64_set_vec_u16 (cpu, vd + 2, 0, val3);
-		aarch64_set_vec_u16 (cpu, vd + 3, 0, val4);
-	      }
-	  }
-	  break;
+/* Load single structure into all lanes of N registers.  */
+static void
+do_vec_LDnR (sim_cpu *cpu, uint64_t address)
+{
+  /* instr[31]    = 0
+     instr[30]    = element selector 0=>half, 1=>all elements
+     instr[29,24] = 00 1101
+     instr[23]    = 0=>simple, 1=>post
+     instr[22]    = 1
+     instr[21]    = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
+     instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
+                    11111 (immediate post inc)
+     instr[15,14] = 11
+     instr[13]    = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
+     instr[12]    = 0
+     instr[11,10] = element size 00=> byte(b), 01=> half(h),
+                    10=> word(s), 11=> double(d)
+     instr[9,5]   = address
+     instr[4,0]   = Vd  */
 
-	case 2:
-	  {
-	    uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
-	    uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
-	    uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
-	    uint32_t val4 = aarch64_get_mem_u32 (cpu, address + 12);
-
-	    for (i = 0; i < (full ? 4 : 2); i++)
-	      {
-		aarch64_set_vec_u32 (cpu, vd, 0, val1);
-		aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
-		aarch64_set_vec_u32 (cpu, vd + 2, 0, val3);
-		aarch64_set_vec_u32 (cpu, vd + 3, 0, val4);
-	      }
-	  }
-	  break;
+  unsigned full = INSTR (30, 30);
+  unsigned vd = INSTR (4, 0);
+  unsigned size = INSTR (11, 10);
+  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+  int i, n;
 
-	case 3:
-	  {
-	    uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
-	    uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
-	    uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
-	    uint64_t val4 = aarch64_get_mem_u64 (cpu, address + 24);
-
-	    for (i = 0; i < (full ? 2 : 1); i++)
-	      {
-		aarch64_set_vec_u64 (cpu, vd, 0, val1);
-		aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
-		aarch64_set_vec_u64 (cpu, vd + 2, 0, val3);
-		aarch64_set_vec_u64 (cpu, vd + 3, 0, val4);
-	      }
-	  }
-	  break;
+  NYI_assert (29, 24, 0x0D);
+  NYI_assert (22, 22, 1);
+  NYI_assert (15, 14, 3);
+  NYI_assert (12, 12, 0);
 
-	default:
-	  HALT_UNALLOC;
+  for (n = 0; n < nregs; n++)
+    switch (size)
+      {
+      case 0:
+	{
+	  uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
+	  for (i = 0; i < (full ? 16 : 8); i++)
+	    aarch64_set_vec_u8 (cpu, vd + n, i, val);
+	  break;
 	}
-      break;
 
-    default:
-      HALT_UNALLOC;
-    }
+      case 1:
+	{
+	  uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
+	  for (i = 0; i < (full ? 8 : 4); i++)
+	    aarch64_set_vec_u16 (cpu, vd + n, i, val);
+	  break;
+	}
+
+      case 2:
+	{
+	  uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
+	  for (i = 0; i < (full ? 4 : 2); i++)
+	    aarch64_set_vec_u32 (cpu, vd + n, i, val);
+	  break;
+	}
+
+      case 3:
+	{
+	  uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
+	  for (i = 0; i < (full ? 2 : 1); i++)
+	    aarch64_set_vec_u64 (cpu, vd + n, i, val);
+	  break;
+	}
+
+      default:
+	HALT_UNALLOC;
+      }
 }
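The three functions above share one access pattern: the register number
advances with the structure element (vd + i or vd + n), the lane stays
fixed for the single-lane forms, and the address advances by the element
size.  A toy model of the LDn single-lane case, with assumed names --
vec[][] stands in for the simulator's register file and its
aarch64_set_vec_* accessors, and the % 32 wrap follows the architecture's
register numbering rather than anything in this file:

    #include <stdint.h>
    #include <string.h>

    static uint8_t vec[32][16];   /* 32 vector registers, 16 bytes each.  */

    /* Load an N-register single-structure group: esize-byte elements
       into lane `lane' of registers vd .. vd+nregs-1.  */
    static void
    ldn_single_model (int nregs, int esize, int lane, int vd,
                      const uint8_t *mem)
    {
      int i;
      for (i = 0; i < nregs; i++)
        memcpy (&vec[(vd + i) % 32][lane * esize], mem + i * esize, esize);
    }

For ld2 {v0.s, v1.s}[2] at address A this gives v0.s[2] = mem32[A] and
v1.s[2] = mem32[A+4], matching the loop in do_vec_LDn_single.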
 
 static void
@@ -11848,7 +11810,7 @@ do_vec_load_store (sim_cpu *cpu)
      instr[31]    = 0
      instr[30]    = element selector 0=>half, 1=>all elements
      instr[29,25] = 00110
-     instr[24]    = ?
+     instr[24]    = 0=>multiple struct, 1=>single struct
      instr[23]    = 0=>simple, 1=>post
      instr[22]    = 0=>store, 1=>load
      instr[21]    = 0 (LDn) / small(0)-large(1) selector (LDnR)
@@ -11876,6 +11838,7 @@ do_vec_load_store (sim_cpu *cpu)
      instr[9,5]   = Vn, can be SP
      instr[4,0]   = Vd  */
 
+  int single;
   int post;
   int load;
   unsigned vn;
@@ -11885,15 +11848,16 @@ do_vec_load_store (sim_cpu *cpu)
   if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
     HALT_NYI;
 
-  type = INSTR (15, 12);
-  if (type != 0xE && type != 0xE && INSTR (21, 21) != 0)
-    HALT_NYI;
-
+  single = INSTR (24, 24);
   post = INSTR (23, 23);
   load = INSTR (22, 22);
+  type = INSTR (15, 12);
   vn = INSTR (9, 5);
   address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
 
+  if (! single && INSTR (21, 21) != 0)
+    HALT_UNALLOC;
+
   if (post)
     {
       unsigned vm = INSTR (20, 16);
@@ -11902,49 +11866,78 @@ do_vec_load_store (sim_cpu *cpu)
 	{
 	  unsigned sizeof_operation;
 
-	  switch (type)
+	  if (single)
 	    {
-	    case 0: sizeof_operation = 32; break;
-	    case 4: sizeof_operation = 24; break;
-	    case 8: sizeof_operation = 16; break;
-
-	    case 0xC:
-	      sizeof_operation = INSTR (21, 21) ? 2 : 1;
-	      sizeof_operation <<= INSTR (11, 10);
-	      break;
+	      if ((type >= 0) && (type <= 11))
+		{
+		  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+		  switch (INSTR (15, 14))
+		    {
+		    case 0:
+		      sizeof_operation = nregs * 1;
+		      break;
+		    case 1:
+		      sizeof_operation = nregs * 2;
+		      break;
+		    case 2:
+		      if (INSTR (10, 10) == 0)
+			sizeof_operation = nregs * 4;
+		      else
+			sizeof_operation = nregs * 8;
+		      break;
+		    default:
+		      HALT_UNALLOC;
+		    }
+		}
+	      else if (type == 0xC)
+		{
+		  sizeof_operation = INSTR (21, 21) ? 2 : 1;
+		  sizeof_operation <<= INSTR (11, 10);
+		}
+	      else if (type == 0xE)
+		{
+		  sizeof_operation = INSTR (21, 21) ? 4 : 3;
+		  sizeof_operation <<= INSTR (11, 10);
+		}
+	      else
+		HALT_UNALLOC;
+	    }
+	  else
+	    {
+	      switch (type)
+		{
+		case 0: sizeof_operation = 32; break;
+		case 4: sizeof_operation = 24; break;
+		case 8: sizeof_operation = 16; break;
 
-	    case 0xE:
-	      sizeof_operation = INSTR (21, 21) ? 8 : 4;
-	      sizeof_operation <<= INSTR (11, 10);
-	      break;
+		case 7:
+		  /* One register, immediate offset variant.  */
+		  sizeof_operation = 8;
+		  break;
 
-	    case 7:
-	      /* One register, immediate offset variant.  */
-	      sizeof_operation = 8;
-	      break;
+		case 10:
+		  /* Two registers, immediate offset variant.  */
+		  sizeof_operation = 16;
+		  break;
 
-	    case 10:
-	      /* Two registers, immediate offset variant.  */
-	      sizeof_operation = 16;
-	      break;
+		case 6:
+		  /* Three registers, immediate offset variant.  */
+		  sizeof_operation = 24;
+		  break;
 
-	    case 6:
-	      /* Three registers, immediate offset variant.  */
-	      sizeof_operation = 24;
-	      break;
+		case 2:
+		  /* Four registers, immediate offset variant.  */
+		  sizeof_operation = 32;
+		  break;
 
-	    case 2:
-	      /* Four registers, immediate offset variant.  */
-	      sizeof_operation = 32;
-	      break;
+		default:
+		  HALT_UNALLOC;
+		}
 
-	    default:
-	      HALT_UNALLOC;
+	      if (INSTR (30, 30))
+		sizeof_operation *= 2;
 	    }
 
-	  if (INSTR (30, 30))
-	    sizeof_operation *= 2;
-
 	  aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
 	}
       else
@@ -11956,6 +11949,29 @@ do_vec_load_store (sim_cpu *cpu)
 	  NYI_assert (20, 16, 0);
 	}
 
+  if (single)
+    {
+      if (load)
+	{
+	  if ((type >= 0) && (type <= 11))
+	    do_vec_LDn_single (cpu, address);
+	  else if ((type == 0xC) || (type == 0xE))
+	    do_vec_LDnR (cpu, address);
+	  else
+	    HALT_UNALLOC;
+	  return;
+	}
+
+      /* Stores.  */
+      if ((type >= 0) && (type <= 11))
+	{
+	  do_vec_STn_single (cpu, address);
+	  return;
+	}
+
+      HALT_UNALLOC;
+    }
+
   if (load)
     {
       switch (type)
@@ -11968,11 +11984,8 @@ do_vec_load_store (sim_cpu *cpu)
       case 10: LD1_2 (cpu, address); return;
       case 7:  LD1_1 (cpu, address); return;
 
-      case 0xE:
-      case 0xC: do_vec_LDnR (cpu, address); return;
-
       default:
-	HALT_NYI;
+	HALT_UNALLOC;
       }
     }
 
@@ -11987,7 +12000,7 @@ do_vec_load_store (sim_cpu *cpu)
       case 10: ST1_2 (cpu, address); return;
       case 7:  ST1_1 (cpu, address); return;
       default:
-	HALT_NYI;
+	HALT_UNALLOC;
       }
     }
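With post-indexing and the immediate form, the writeback amount for the
single-structure forms is just nregs times the element size.  A compact
restatement of the new sizeof_operation logic for that path -- a sketch
using the same field values, not the simulator's code:

    /* opcode = instr[15,14], size0 = instr[10], nregs as computed above.
       Returns the post-increment in bytes, or -1 if unallocated.  */
    static int
    single_struct_postinc (int nregs, int opcode, int size0)
    {
      switch (opcode)
        {
        case 0:  return nregs * 1;                     /* B  */
        case 1:  return nregs * 2;                     /* H  */
        case 2:  return size0 ? nregs * 8 : nregs * 4; /* S or D  */
        default: return -1;
        }
    }

So ld4 {v0.s, v1.s, v2.s, v3.s}[1], [x2], #16 advances x2 by 4 * 4 = 16
bytes, which is what the new tests below rely on.  Note also that the
replicating forms get corrected register counts here: type 0xC (LD1R/LD2R)
uses 1 or 2 elements and type 0xE (LD3R/LD4R) uses 3 or 4, where the old
code used 4 and 8.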
diff --git a/sim/testsuite/sim/aarch64/ChangeLog b/sim/testsuite/sim/aarch64/ChangeLog
index 6a39354..86940e2 100644
--- a/sim/testsuite/sim/aarch64/ChangeLog
+++ b/sim/testsuite/sim/aarch64/ChangeLog
@@ -1,3 +1,9 @@
+2017-02-14  Jim Wilson  <jim.wilson@linaro.org>
+
+	* ldn_single.s: New.
+	* ldnr.s: New.
+	* stn_single.s: New.
+
 2017-01-23  Jim Wilson  <jim.wilson@linaro.org>
 
 	* cmtst.s: New.
diff --git a/sim/testsuite/sim/aarch64/ldn_single.s b/sim/testsuite/sim/aarch64/ldn_single.s
new file mode 100644
index 0000000..3102e9e
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/ldn_single.s
@@ -0,0 +1,100 @@
+# mach: aarch64
+
+# Check the load single 1-element structure to one lane instructions:
+# ld1, ld2, ld3, ld4.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+input:
+	.word 0x04030201
+	.word 0x08070605
+	.word 0x0c0b0a09
+	.word 0x100f0e0d
+	.word 0x14131211
+	.word 0x18171615
+	.word 0x1c1b1a19
+	.word 0x201f1e1d
+
+	start
+	adrp x0, input
+	add x0, x0, :lo12:input
+
+	mov x2, x0
+	mov x3, #1
+	mov x4, #4
+	ld1 {v0.b}[0], [x2], 1
+	ld1 {v0.b}[1], [x2], x3
+	ld1 {v0.h}[1], [x2], 2
+	ld1 {v0.s}[1], [x2], x4
+	ld1 {v0.d}[1], [x2]
+	addv b1, v0.16b
+	mov x5, v1.d[0]
+	cmp x5, #136
+	bne .Lfailure
+
+	mov x2, x0
+	mov x3, #16
+	mov x4, #4
+	ld2 {v0.d, v1.d}[0], [x2], x3
+	ld2 {v0.s, v1.s}[2], [x2], 8
+	ld2 {v0.h, v1.h}[6], [x2], x4
+	ld2 {v0.b, v1.b}[14], [x2], 2
+	ld2 {v0.b, v1.b}[15], [x2]
+	addv b2, v0.16b
+	addv b3, v1.16b
+	mov x5, v2.d[0]
+	mov x6, v3.d[0]
+	cmp x5, #221
+	bne .Lfailure
+	cmp x6, #307
+	bne .Lfailure
+
+	mov x2, x0
+	ld3 {v0.s, v1.s, v2.s}[0], [x2], 12
+	ld3 {v0.s, v1.s, v2.s}[1], [x2]
+	mov x2, x0
+	mov x3, #12
+	ld3 {v0.s, v1.s, v2.s}[2], [x2], x3
+	ld3 {v0.s, v1.s, v2.s}[3], [x2]
+	addv b3, v0.16b
+	addv b4, v1.16b
+	addv b5, v2.16b
+	mov x4, v3.d[0]
+	mov x5, v4.d[0]
+	mov x6, v5.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	cmp x5, #200
+	bne .Lfailure
+	cmp x6, #264
+	bne .Lfailure
+
+	mov x2, x0
+	ld4 {v0.s, v1.s, v2.s, v3.s}[0], [x2], 16
+	ld4 {v0.s, v1.s, v2.s, v3.s}[1], [x2]
+	mov x2, x0
+	mov x3, #16
+	ld4 {v0.s, v1.s, v2.s, v3.s}[2], [x2], x3
+	ld4 {v0.s, v1.s, v2.s, v3.s}[3], [x2]
+	addv b4, v0.16b
+	addv b5, v1.16b
+	addv b6, v2.16b
+	addv b7, v3.16b
+	mov x4, v4.d[0]
+	mov x5, v5.d[0]
+	mov x6, v6.d[0]
+	mov x7, v7.d[0]
+	cmp x4, #168
+	bne .Lfailure
+	cmp x5, #232
+	bne .Lfailure
+	cmp x6, #296
+	bne .Lfailure
+	cmp x7, #360
+	bne .Lfailure
+
+	pass
+.Lfailure:
+	fail
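The expected values in this test are plain byte sums: ADDV adds every
byte lane, and a sum does not care which lane each byte landed in.
After the ld1 sequence, v0 holds the sixteen input bytes 0x01..0x10, so
the check constant is 1 + 2 + ... + 16 = 16 * 17 / 2 = 136.  A host-side
sanity check of that constant (illustrative only, not part of the
testsuite):

    #include <assert.h>

    int
    main (void)
    {
      int b, sum = 0;
      for (b = 1; b <= 16; b++)   /* bytes 0x01..0x10 of `input'.  */
        sum += b;
      assert (sum == 136);        /* matches `cmp x5, #136' above.  */
      return 0;
    }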
diff --git a/sim/testsuite/sim/aarch64/ldnr.s b/sim/testsuite/sim/aarch64/ldnr.s
new file mode 100644
index 0000000..a4bfffa
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/ldnr.s
@@ -0,0 +1,176 @@
+# mach: aarch64
+
+# Check the load single 1-element structure and replicate to all lanes insns:
+# ld1r, ld2r, ld3r, ld4r.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+input:
+	.word 0x04030201
+	.word 0x08070605
+	.word 0x0c0b0a09
+	.word 0x100f0e0d
+input2:
+	.word 0x00000001
+	.word 0x00000002
+	.word 0x00000003
+	.word 0x00000004
+	.word 0x00000005
+	.word 0x00000006
+	.word 0x00000007
+	.word 0x00000008
+	.word 0x00000009
+	.word 0x0000000a
+	.word 0x0000000b
+	.word 0x0000000c
+
+	start
+	adrp x0, input
+	add x0, x0, :lo12:input
+	adrp x1, input2
+	add x1, x1, :lo12:input2
+
+	mov x2, x0
+	mov x3, #1
+	ld1r {v0.8b}, [x2], 1
+	ld1r {v1.16b}, [x2], x3
+	ld1r {v2.4h}, [x2], 2
+	ld1r {v3.8h}, [x2]
+	addv b0, v0.8b
+	addv b1, v1.16b
+	addv b2, v2.8b
+	addv b3, v3.16b
+	mov x2, v0.d[0]
+	mov x3, v1.d[0]
+	mov x4, v2.d[0]
+	mov x5, v3.d[0]
+	cmp x2, #8
+	bne .Lfailure
+	cmp x3, #32
+	bne .Lfailure
+	cmp x4, #28
+	bne .Lfailure
+	cmp x5, #88
+	bne .Lfailure
+
+	mov x2, x1
+	mov x3, #8
+	ld2r {v0.2s, v1.2s}, [x2], 8
+	ld2r {v2.4s, v3.4s}, [x2], x3
+	ld2r {v4.1d, v5.1d}, [x2], 16
+	ld2r {v6.2d, v7.2d}, [x2]
+	addp v0.2s, v0.2s, v1.2s
+	addv s2, v2.4s
+	addv s3, v3.4s
+	addp v4.2s, v4.2s, v5.2s
+	addv s6, v6.4s
+	addv s7, v7.4s
+	mov w2, v0.s[0]
+	mov w3, v0.s[1]
+	mov x4, v2.d[0]
+	mov x5, v3.d[0]
+	mov w6, v4.s[0]
+	mov w7, v4.s[1]
+	mov x8, v6.d[0]
+	mov x9, v7.d[0]
+	cmp w2, #2
+	bne .Lfailure
+	cmp w3, #4
+	bne .Lfailure
+	cmp x4, #12
+	bne .Lfailure
+	cmp x5, #16
+	bne .Lfailure
+	cmp w6, #11
+	bne .Lfailure
+	cmp w7, #15
+	bne .Lfailure
+	cmp x8, #38
+	bne .Lfailure
+	cmp x9, #46
+	bne .Lfailure
+
+	mov x2, x0
+	mov x3, #3
+	ld3r {v0.8b, v1.8b, v2.8b}, [x2], 3
+	ld3r {v3.8b, v4.8b, v5.8b}, [x2], x3
+	ld3r {v6.8b, v7.8b, v8.8b}, [x2]
+	addv b0, v0.8b
+	addv b1, v1.8b
+	addv b2, v2.8b
+	addv b3, v3.8b
+	addv b4, v4.8b
+	addv b5, v5.8b
+	addv b6, v6.8b
+	addv b7, v7.8b
+	addv b8, v8.8b
+	addv b9, v9.8b
+	mov x2, v0.d[0]
+	mov x3, v1.d[0]
+	mov x4, v2.d[0]
+	mov x5, v3.d[0]
+	mov x6, v4.d[0]
+	mov x7, v5.d[0]
+	mov x8, v6.d[0]
+	mov x9, v7.d[0]
+	mov x10, v8.d[0]
+	cmp x2, #8
+	bne .Lfailure
+	cmp x3, #16
+	bne .Lfailure
+	cmp x4, #24
+	bne .Lfailure
+	cmp x5, #32
+	bne .Lfailure
+	cmp x6, #40
+	bne .Lfailure
+	cmp x7, #48
+	bne .Lfailure
+	cmp x8, #56
+	bne .Lfailure
+	cmp x9, #64
+	bne .Lfailure
+	cmp x10, #72
+	bne .Lfailure
+
+	mov x2, x1
+	ld4r {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], 16
+	ld4r {v4.4s, v5.4s, v6.4s, v7.4s}, [x2]
+	addv s0, v0.4s
+	addv s1, v1.4s
+	addv s2, v2.4s
+	addv s3, v3.4s
+	addv s4, v4.4s
+	addv s5, v5.4s
+	addv s6, v6.4s
+	addv s7, v7.4s
+	mov x2, v0.d[0]
+	mov x3, v1.d[0]
+	mov x4, v2.d[0]
+	mov x5, v3.d[0]
+	mov x6, v4.d[0]
+	mov x7, v5.d[0]
+	mov x8, v6.d[0]
+	mov x9, v7.d[0]
+	cmp x2, #4
+	bne .Lfailure
+	cmp x3, #8
+	bne .Lfailure
+	cmp x4, #12
+	bne .Lfailure
+	cmp x5, #16
+	bne .Lfailure
+	cmp x6, #20
+	bne .Lfailure
+	cmp x7, #24
+	bne .Lfailure
+	cmp x8, #28
+	bne .Lfailure
+	cmp x9, #32
+	bne .Lfailure
+
+	pass
+.Lfailure:
+	fail
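For the replicating loads the expected sums are even simpler: every lane
of a destination register holds the same element, so each ADDV result is
just lanes times the element's byte sum.  An illustrative helper (not
part of the testsuite):

    /* Expected ADDV byte-sum for an LDnR destination.  */
    static int
    ldnr_expected_sum (int lanes, int element_byte_sum)
    {
      return lanes * element_byte_sum;
    }

For example, ld1r {v0.8b} replicates the byte 0x01, giving
ldnr_expected_sum (8, 1) == 8 (the `cmp x2, #8` check), and
ld1r {v2.4h} replicates the halfword 0x0403 whose bytes sum to 7,
giving ldnr_expected_sum (4, 7) == 28 (the `cmp x4, #28` check).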
diff --git a/sim/testsuite/sim/aarch64/stn_single.s b/sim/testsuite/sim/aarch64/stn_single.s
new file mode 100644
index 0000000..5527c84
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/stn_single.s
@@ -0,0 +1,122 @@
+# mach: aarch64
+
+# Check the store single 1-element structure to one lane instructions:
+# st1, st2, st3, st4.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+input:
+	.word 0x04030201
+	.word 0x08070605
+	.word 0x0c0b0a09
+	.word 0x100f0e0d
+	.word 0x14131211
+	.word 0x18171615
+	.word 0x1c1b1a19
+	.word 0x201f1e1d
+output:
+	.zero 64
+
+	start
+	adrp x0, input
+	add x0, x0, :lo12:input
+	adrp x1, output
+	add x1, x1, :lo12:output
+
+	mov x2, x0
+	ldr q0, [x2], 8
+	ldr q1, [x2]
+	mov x2, x0
+	ldr q2, [x2], 8
+	ldr q3, [x2]
+
+	mov x2, x1
+	mov x3, #1
+	mov x4, #4
+	st1 {v0.b}[0], [x2], 1
+	st1 {v0.b}[1], [x2], x3
+	st1 {v0.h}[1], [x2], 2
+	st1 {v0.s}[1], [x2], x4
+	st1 {v0.d}[1], [x2]
+	ldr q4, [x1]
+	addv b4, v4.16b
+	mov x5, v4.d[0]
+	cmp x5, #136
+	bne .Lfailure
+
+	mov x2, x1
+	mov x3, #16
+	mov x4, #4
+	st2 {v0.d, v1.d}[0], [x2], x3
+	st2 {v0.s, v1.s}[2], [x2], 8
+	st2 {v0.h, v1.h}[6], [x2], x4
+	st2 {v0.b, v1.b}[14], [x2], 2
+	st2 {v0.b, v1.b}[15], [x2]
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	mov x5, v4.d[0]
+	mov x6, v5.d[0]
+	cmp x5, #136
+	bne .Lfailure
+	cmp x6, #264
+	bne .Lfailure
+
+	mov x2, x1
+	mov x3, #12
+	st3 {v0.s, v1.s, v2.s}[0], [x2], 12
+	st3 {v0.s, v1.s, v2.s}[1], [x2], x3
+	st3 {v0.s, v1.s, v2.s}[2], [x2], 12
+	st3 {v0.s, v1.s, v2.s}[3], [x2]
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2], 16
+	ldr q6, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	addv b6, v6.16b
+	mov x4, v4.d[0]
+	mov x5, v5.d[0]
+	mov x6, v6.d[0]
+	cmp x4, #88
+	bne .Lfailure
+	cmp x5, #200
+	bne .Lfailure
+	cmp x6, #248
+	bne .Lfailure
+
+	mov x2, x1
+	mov x3, #16
+	st4 {v0.s, v1.s, v2.s, v3.s}[0], [x2], 16
+	st4 {v0.s, v1.s, v2.s, v3.s}[1], [x2], x3
+	st4 {v0.s, v1.s, v2.s, v3.s}[2], [x2], 16
+	st4 {v0.s, v1.s, v2.s, v3.s}[3], [x2]
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2], 16
+	ldr q6, [x2], 16
+	ldr q7, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	addv b6, v6.16b
+	addv b7, v7.16b
+	mov x4, v4.d[0]
+	mov x5, v5.d[0]
+	mov x6, v6.d[0]
+	mov x7, v7.d[0]
+	cmp x4, #104
+	bne .Lfailure
+	cmp x5, #168
+	bne .Lfailure
+	cmp x6, #232
+	bne .Lfailure
+	cmp x7, #296
+	bne .Lfailure
+
+	pass
+.Lfailure:
+	fail
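The store test mirrors ldn_single.s in reverse: each STn scatters one
lane of v0..v3 to memory, and the result is read back with plain ldr and
checked with the same order-insensitive ADDV sums.  do_vec_STn_single is
the exact mirror of the load model sketched earlier (same assumed toy
register file, not the simulator's API):

    #include <stdint.h>
    #include <string.h>

    static uint8_t vec[32][16];   /* 32 vector registers, 16 bytes each.  */

    /* Store an N-register single-structure group: copy lane `lane' of
       registers vd .. vd+nregs-1 out to consecutive esize-byte slots.  */
    static void
    stn_single_model (int nregs, int esize, int lane, int vd,
                      uint8_t *mem)
    {
      int i;
      for (i = 0; i < nregs; i++)
        memcpy (mem + i * esize, &vec[(vd + i) % 32][lane * esize], esize);
    }

That symmetry is why the st1 block above can reuse the load test's
checksum: its five pieces (b[0], b[1], h[1], s[1], d[1]) cover byte
positions 0..15 of `output' exactly once with a permutation of v0's
bytes 0x01..0x10, so the row again sums to 136.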