diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2021-06-16 17:02:30 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2021-06-16 17:02:30 +0100 |
commit | 38848ce565849e5b867a5e08022b3c755039c11a (patch) | |
tree | 8e2f7b8f7d94069e3e33a7f87303acd7459932d7 | |
parent | e3897b75fd2ac8c4bfda95d60309cb6414da8000 (diff) | |
parent | 703235a303d6862a7e3f5c6aa9eff7471cb138b2 (diff) | |
download | qemu-38848ce565849e5b867a5e08022b3c755039c11a.zip qemu-38848ce565849e5b867a5e08022b3c755039c11a.tar.gz qemu-38848ce565849e5b867a5e08022b3c755039c11a.tar.bz2 |
Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20210616' into staging
target-arm queue:
* hw/intc/arm_gicv3_cpuif: Tolerate spurious EOIR writes
* handle some UNALLOCATED decode cases correctly rather than asserting
* hw: virt: consider hw_compat_6_0
* hw/arm: add quanta-gbs-bmc machine
* hw/intc/armv7m_nvic: Remove stale comment
* target/arm: Fix mte page crossing test
* hw/arm: quanta-q71l add pca954x muxes
* target/arm: First few parts of MVE support
# gpg: Signature made Wed 16 Jun 2021 14:34:49 BST
# gpg: using RSA key E1A5C593CD419DE28E8315CF3C2525ED14360CDE
# gpg: issuer "peter.maydell@linaro.org"
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" [ultimate]
# gpg: aka "Peter Maydell <pmaydell@gmail.com>" [ultimate]
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" [ultimate]
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* remotes/pmaydell/tags/pull-target-arm-20210616: (25 commits)
include/qemu/int128.h: Add function to create Int128 from int64_t
bitops.h: Provide hswap32(), hswap64(), wswap64() swapping operations
target/arm: Move expand_pred_b() data to vec_helper.c
target/arm: Add framework for MVE decode
target/arm: Implement MVE LETP insn
target/arm: Implement MVE DLSTP
target/arm: Implement MVE WLSTP insn
target/arm: Implement MVE LCTP
target/arm: Let vfp_access_check() handle late NOCP checks
target/arm: Add handling for PSR.ECI/ICI
target/arm: Handle VPR semantics in existing code
target/arm: Enable FPSCR.QC bit for MVE
target/arm: Provide and use H8 and H1_8 macros
hw/arm: quanta-q71l add pca954x muxes
hw/arm: gsj add pca9548
hw/arm: gsj add i2c comments
target/arm: Fix mte page crossing test
hw/intc/armv7m_nvic: Remove stale comment
hw/arm: quanta-gbs-bmc add i2c comments
hw/arm: add quanta-gbs-bmc machine
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | hw/arm/Kconfig | 2 | ||||
-rw-r--r-- | hw/arm/aspeed.c | 11 | ||||
-rw-r--r-- | hw/arm/npcm7xx_boards.c | 107 | ||||
-rw-r--r-- | hw/arm/virt.c | 2 | ||||
-rw-r--r-- | hw/intc/arm_gicv3_cpuif.c | 5 | ||||
-rw-r--r-- | hw/intc/armv7m_nvic.c | 6 | ||||
-rw-r--r-- | include/qemu/bitops.h | 29 | ||||
-rw-r--r-- | include/qemu/int128.h | 10 | ||||
-rw-r--r-- | target/arm/m_helper.c | 54 | ||||
-rw-r--r-- | target/arm/meson.build | 2 | ||||
-rw-r--r-- | target/arm/mte_helper.c | 2 | ||||
-rw-r--r-- | target/arm/mve.decode | 20 | ||||
-rw-r--r-- | target/arm/sve_helper.c | 381 | ||||
-rw-r--r-- | target/arm/t32.decode | 15 | ||||
-rw-r--r-- | target/arm/translate-a32.h | 2 | ||||
-rw-r--r-- | target/arm/translate-a64.c | 83 | ||||
-rw-r--r-- | target/arm/translate-m-nocp.c | 16 | ||||
-rw-r--r-- | target/arm/translate-mve.c | 29 | ||||
-rw-r--r-- | target/arm/translate-vfp.c | 65 | ||||
-rw-r--r-- | target/arm/translate.c | 300 | ||||
-rw-r--r-- | target/arm/translate.h | 9 | ||||
-rw-r--r-- | target/arm/vec_helper.c | 116 | ||||
-rw-r--r-- | target/arm/vec_internal.h | 9 | ||||
-rw-r--r-- | target/arm/vfp_helper.c | 3 | ||||
-rw-r--r-- | tests/tcg/aarch64/Makefile.target | 2 | ||||
-rw-r--r-- | tests/tcg/aarch64/mte-7.c | 31 |
26 files changed, 965 insertions, 346 deletions
diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index 67723d9..647b5c8 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -378,6 +378,7 @@ config NPCM7XX select SERIAL select SSI select UNIMP + select PCA954X config FSL_IMX25 bool @@ -413,6 +414,7 @@ config ASPEED_SOC select PCA9552 select SERIAL select SMBUS_EEPROM + select PCA954X select SSI select SSI_M25P80 select TMP105 diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c index 0eafc79..1301e8f 100644 --- a/hw/arm/aspeed.c +++ b/hw/arm/aspeed.c @@ -14,6 +14,7 @@ #include "hw/arm/boot.h" #include "hw/arm/aspeed.h" #include "hw/arm/aspeed_soc.h" +#include "hw/i2c/i2c_mux_pca954x.h" #include "hw/i2c/smbus_eeprom.h" #include "hw/misc/pca9552.h" #include "hw/misc/tmp105.h" @@ -461,14 +462,18 @@ static void quanta_q71l_bmc_i2c_init(AspeedMachineState *bmc) /* TODO: i2c-1: Add Frontpanel FRU eeprom@57 24c64 */ /* TODO: Add Memory Riser i2c mux and eeproms. */ - /* TODO: i2c-2: pca9546@74 */ - /* TODO: i2c-2: pca9548@77 */ + i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 2), "pca9546", 0x74); + i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 2), "pca9548", 0x77); + /* TODO: i2c-3: Add BIOS FRU eeprom@56 24c64 */ - /* TODO: i2c-7: Add pca9546@70 */ + + /* i2c-7 */ + i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 7), "pca9546", 0x70); /* - i2c@0: pmbus@59 */ /* - i2c@1: pmbus@58 */ /* - i2c@2: pmbus@58 */ /* - i2c@3: pmbus@59 */ + /* TODO: i2c-7: Add PDB FRU eeprom@52 */ /* TODO: i2c-8: Add BMC FRU eeprom@50 */ } diff --git a/hw/arm/npcm7xx_boards.c b/hw/arm/npcm7xx_boards.c index 698be46..e5a3243 100644 --- a/hw/arm/npcm7xx_boards.c +++ b/hw/arm/npcm7xx_boards.c @@ -18,6 +18,7 @@ #include "hw/arm/npcm7xx.h" #include "hw/core/cpu.h" +#include "hw/i2c/i2c_mux_pca954x.h" #include "hw/i2c/smbus_eeprom.h" #include "hw/loader.h" #include "hw/qdev-core.h" @@ -29,6 +30,7 @@ #define NPCM750_EVB_POWER_ON_STRAPS 0x00001ff7 #define QUANTA_GSJ_POWER_ON_STRAPS 0x00001fff +#define QUANTA_GBS_POWER_ON_STRAPS 0x000017ff 
static const char npcm7xx_default_bootrom[] = "npcm7xx_bootrom.bin"; @@ -220,7 +222,18 @@ static void quanta_gsj_i2c_init(NPCM7xxState *soc) at24c_eeprom_init(soc, 9, 0x55, 8192); at24c_eeprom_init(soc, 10, 0x55, 8192); - /* TODO: Add additional i2c devices. */ + /* + * i2c-11: + * - power-brick@36: delta,dps800 + * - hotswap@15: ti,lm5066i + */ + + /* + * i2c-12: + * - ucd90160@6b + */ + + i2c_slave_create_simple(npcm7xx_i2c_get_bus(soc, 15), "pca9548", 0x75); } static void quanta_gsj_fan_init(NPCM7xxMachine *machine, NPCM7xxState *soc) @@ -237,6 +250,65 @@ static void quanta_gsj_fan_init(NPCM7xxMachine *machine, NPCM7xxState *soc) npcm7xx_connect_pwm_fan(soc, &splitter[2], 0x05, 1); } +static void quanta_gbs_i2c_init(NPCM7xxState *soc) +{ + /* + * i2c-0: + * pca9546@71 + * + * i2c-1: + * pca9535@24 + * pca9535@20 + * pca9535@21 + * pca9535@22 + * pca9535@23 + * pca9535@25 + * pca9535@26 + * + * i2c-2: + * sbtsi@4c + * + * i2c-5: + * atmel,24c64@50 mb_fru + * pca9546@71 + * - channel 0: max31725@54 + * - channel 1: max31725@55 + * - channel 2: max31725@5d + * atmel,24c64@51 fan_fru + * - channel 3: atmel,24c64@52 hsbp_fru + * + * i2c-6: + * pca9545@73 + * + * i2c-7: + * pca9545@72 + * + * i2c-8: + * adi,adm1272@10 + * + * i2c-9: + * pca9546@71 + * - channel 0: isil,isl68137@60 + * - channel 1: isil,isl68137@61 + * - channel 2: isil,isl68137@63 + * - channel 3: isil,isl68137@45 + * + * i2c-10: + * pca9545@71 + * + * i2c-11: + * pca9545@76 + * + * i2c-12: + * maxim,max34451@4e + * isil,isl68137@5d + * isil,isl68137@5e + * + * i2c-14: + * pca9545@70 + */ +} + static void npcm750_evb_init(MachineState *machine) { NPCM7xxState *soc; @@ -268,6 +340,23 @@ static void quanta_gsj_init(MachineState *machine) npcm7xx_load_kernel(machine, soc); } +static void quanta_gbs_init(MachineState *machine) +{ + NPCM7xxState *soc; + + soc = npcm7xx_create_soc(machine, QUANTA_GBS_POWER_ON_STRAPS); + npcm7xx_connect_dram(soc, machine->ram); + qdev_realize(DEVICE(soc), NULL, 
&error_fatal); + + npcm7xx_load_bootrom(machine, soc); + + npcm7xx_connect_flash(&soc->fiu[0], 0, "mx66u51235f", + drive_get(IF_MTD, 0, 0)); + + quanta_gbs_i2c_init(soc); + npcm7xx_load_kernel(machine, soc); +} + static void npcm7xx_set_soc_type(NPCM7xxMachineClass *nmc, const char *type) { NPCM7xxClass *sc = NPCM7XX_CLASS(object_class_by_name(type)); @@ -316,6 +405,18 @@ static void gsj_machine_class_init(ObjectClass *oc, void *data) mc->default_ram_size = 512 * MiB; }; +static void gbs_bmc_machine_class_init(ObjectClass *oc, void *data) +{ + NPCM7xxMachineClass *nmc = NPCM7XX_MACHINE_CLASS(oc); + MachineClass *mc = MACHINE_CLASS(oc); + + npcm7xx_set_soc_type(nmc, TYPE_NPCM730); + + mc->desc = "Quanta GBS (Cortex-A9)"; + mc->init = quanta_gbs_init; + mc->default_ram_size = 1 * GiB; +} + static const TypeInfo npcm7xx_machine_types[] = { { .name = TYPE_NPCM7XX_MACHINE, @@ -332,6 +433,10 @@ static const TypeInfo npcm7xx_machine_types[] = { .name = MACHINE_TYPE_NAME("quanta-gsj"), .parent = TYPE_NPCM7XX_MACHINE, .class_init = gsj_machine_class_init, + }, { + .name = MACHINE_TYPE_NAME("quanta-gbs-bmc"), + .parent = TYPE_NPCM7XX_MACHINE, + .class_init = gbs_bmc_machine_class_init, }, }; diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 9122e22..4b96f06 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2766,6 +2766,8 @@ DEFINE_VIRT_MACHINE_AS_LATEST(6, 1) static void virt_machine_6_0_options(MachineClass *mc) { + virt_machine_6_1_options(mc); + compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len); } DEFINE_VIRT_MACHINE(6, 0) diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index 81f94c7..3e0641a 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -14,6 +14,7 @@ #include "qemu/osdep.h" #include "qemu/bitops.h" +#include "qemu/log.h" #include "qemu/main-loop.h" #include "trace.h" #include "gicv3_internal.h" @@ -1357,7 +1358,9 @@ static void icc_eoir_write(CPUARMState *env, const ARMCPRegInfo *ri, } break; default: - 
g_assert_not_reached(); + qemu_log_mask(LOG_GUEST_ERROR, + "%s: IRQ %d isn't active\n", __func__, irq); + return; } icc_drop_prio(cs, grp); diff --git a/hw/intc/armv7m_nvic.c b/hw/intc/armv7m_nvic.c index c4287d8..94fe002 100644 --- a/hw/intc/armv7m_nvic.c +++ b/hw/intc/armv7m_nvic.c @@ -2941,12 +2941,6 @@ static void armv7m_nvic_realize(DeviceState *dev, Error **errp) static void armv7m_nvic_instance_init(Object *obj) { - /* We have a different default value for the num-irq property - * than our superclass. This function runs after qdev init - * has set the defaults from the Property array and before - * any user-specified property setting, so just modify the - * value in the GICState struct. - */ DeviceState *dev = DEVICE(obj); NVICState *nvic = NVIC(obj); SysBusDevice *sbd = SYS_BUS_DEVICE(obj); diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h index a72f69f..03213ce 100644 --- a/include/qemu/bitops.h +++ b/include/qemu/bitops.h @@ -292,6 +292,35 @@ static inline uint64_t ror64(uint64_t word, unsigned int shift) } /** + * hswap32 - swap 16-bit halfwords within a 32-bit value + * @h: value to swap + */ +static inline uint32_t hswap32(uint32_t h) +{ + return rol32(h, 16); +} + +/** + * hswap64 - swap 16-bit halfwords within a 64-bit value + * @h: value to swap + */ +static inline uint64_t hswap64(uint64_t h) +{ + uint64_t m = 0x0000ffff0000ffffull; + h = rol64(h, 32); + return ((h & m) << 16) | ((h >> 16) & m); +} + +/** + * wswap64 - swap 32-bit words within a 64-bit value + * @h: value to swap + */ +static inline uint64_t wswap64(uint64_t h) +{ + return rol64(h, 32); +} + +/** * extract32: * @value: the value to extract the bit field from * @start: the lowest bit in the bit field (numbered from 0) diff --git a/include/qemu/int128.h b/include/qemu/int128.h index 52fc238..6450038 100644 --- a/include/qemu/int128.h +++ b/include/qemu/int128.h @@ -11,6 +11,11 @@ static inline Int128 int128_make64(uint64_t a) return a; } +static inline Int128 
int128_makes64(int64_t a) +{ + return a; +} + static inline Int128 int128_make128(uint64_t lo, uint64_t hi) { return (__uint128_t)hi << 64 | lo; @@ -167,6 +172,11 @@ static inline Int128 int128_make64(uint64_t a) return (Int128) { a, 0 }; } +static inline Int128 int128_makes64(int64_t a) +{ + return (Int128) { a, a >> 63 }; +} + static inline Int128 int128_make128(uint64_t lo, uint64_t hi) { return (Int128) { lo, hi }; diff --git a/target/arm/m_helper.c b/target/arm/m_helper.c index 074c543..7a1e35a 100644 --- a/target/arm/m_helper.c +++ b/target/arm/m_helper.c @@ -378,7 +378,7 @@ void HELPER(v7m_preserve_fp_state)(CPUARMState *env) uint32_t shi = extract64(dn, 32, 32); if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR */ + faddr += 8; /* skip the slot for the FPSCR/VPR */ } stacked_ok = stacked_ok && v7m_stack_write(cpu, faddr, slo, mmu_idx, STACK_LAZYFP) && @@ -388,6 +388,11 @@ void HELPER(v7m_preserve_fp_state)(CPUARMState *env) stacked_ok = stacked_ok && v7m_stack_write(cpu, fpcar + 0x40, vfp_get_fpscr(env), mmu_idx, STACK_LAZYFP); + if (cpu_isar_feature(aa32_mve, cpu)) { + stacked_ok = stacked_ok && + v7m_stack_write(cpu, fpcar + 0x44, + env->v7m.vpr, mmu_idx, STACK_LAZYFP); + } } /* @@ -410,16 +415,19 @@ void HELPER(v7m_preserve_fp_state)(CPUARMState *env) env->v7m.fpccr[is_secure] &= ~R_V7M_FPCCR_LSPACT_MASK; if (ts) { - /* Clear s0 to s31 and the FPSCR */ + /* Clear s0 to s31 and the FPSCR and VPR */ int i; for (i = 0; i < 32; i += 2) { *aa32_vfp_dreg(env, i / 2) = 0; } vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } } /* - * Otherwise s0 to s15 and FPSCR are UNKNOWN; we choose to leave them + * Otherwise s0 to s15, FPSCR and VPR are UNKNOWN; we choose to leave them * unchanged. 
*/ } @@ -1044,6 +1052,7 @@ static void v7m_update_fpccr(CPUARMState *env, uint32_t frameptr, void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) { /* fptr is the value of Rn, the frame pointer we store the FP regs to */ + ARMCPU *cpu = env_archcpu(env); bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK; uintptr_t ra = GETPC(); @@ -1092,9 +1101,12 @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) cpu_stl_data_ra(env, faddr + 4, shi, ra); } cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra); + if (cpu_isar_feature(aa32_mve, cpu)) { + cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra); + } /* - * If TS is 0 then s0 to s15 and FPSCR are UNKNOWN; we choose to + * If TS is 0 then s0 to s15, FPSCR and VPR are UNKNOWN; we choose to * leave them unchanged, matching our choice in v7m_preserve_fp_state. */ if (ts) { @@ -1102,6 +1114,9 @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) *aa32_vfp_dreg(env, i / 2) = 0; } vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } } } else { v7m_update_fpccr(env, fptr, false); @@ -1112,6 +1127,7 @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) { + ARMCPU *cpu = env_archcpu(env); uintptr_t ra = GETPC(); /* fptr is the value of Rn, the frame pointer we load the FP regs from */ @@ -1144,7 +1160,7 @@ void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) uint32_t faddr = fptr + 4 * i; if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR */ + faddr += 8; /* skip the slot for the FPSCR and VPR */ } slo = cpu_ldl_data_ra(env, faddr, ra); @@ -1155,6 +1171,9 @@ void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) } fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra); vfp_set_fpscr(env, fpscr); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra); + } } env->v7m.control[M_REG_S] |= R_V7M_CONTROL_FPCA_MASK; @@ 
-1298,7 +1317,7 @@ static bool v7m_push_stack(ARMCPU *cpu) uint32_t shi = extract64(dn, 32, 32); if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR */ + faddr += 8; /* skip the slot for the FPSCR and VPR */ } stacked_ok = stacked_ok && v7m_stack_write(cpu, faddr, slo, @@ -1309,11 +1328,19 @@ static bool v7m_push_stack(ARMCPU *cpu) stacked_ok = stacked_ok && v7m_stack_write(cpu, frameptr + 0x60, vfp_get_fpscr(env), mmu_idx, STACK_NORMAL); + if (cpu_isar_feature(aa32_mve, cpu)) { + stacked_ok = stacked_ok && + v7m_stack_write(cpu, frameptr + 0x64, + env->v7m.vpr, mmu_idx, STACK_NORMAL); + } if (cpacr_pass) { for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) { *aa32_vfp_dreg(env, i / 2) = 0; } vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } } } else { /* Lazy stacking enabled, save necessary info to stack later */ @@ -1536,13 +1563,16 @@ static void do_v7m_exception_exit(ARMCPU *cpu) v7m_exception_taken(cpu, excret, true, false); } } - /* Clear s0..s15 and FPSCR; TODO also VPR when MVE is implemented */ + /* Clear s0..s15, FPSCR and VPR */ int i; for (i = 0; i < 16; i += 2) { *aa32_vfp_dreg(env, i / 2) = 0; } vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } } } @@ -1771,7 +1801,7 @@ static void do_v7m_exception_exit(ARMCPU *cpu) uint32_t faddr = frameptr + 0x20 + 4 * i; if (i >= 16) { - faddr += 8; /* Skip the slot for the FPSCR */ + faddr += 8; /* Skip the slot for the FPSCR and VPR */ } pop_ok = pop_ok && @@ -1790,6 +1820,11 @@ static void do_v7m_exception_exit(ARMCPU *cpu) if (pop_ok) { vfp_set_fpscr(env, fpscr); } + if (cpu_isar_feature(aa32_mve, cpu)) { + pop_ok = pop_ok && + v7m_stack_read(cpu, &env->v7m.vpr, + frameptr + 0x64, mmu_idx); + } if (!pop_ok) { /* * These regs are 0 if security extension present; @@ -1799,6 +1834,9 @@ static void do_v7m_exception_exit(ARMCPU *cpu) *aa32_vfp_dreg(env, i / 2) = 0; } vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, 
cpu)) { + env->v7m.vpr = 0; + } } } } diff --git a/target/arm/meson.build b/target/arm/meson.build index 5bfaf43..2b50be3 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -6,6 +6,7 @@ gen = [ decodetree.process('vfp.decode', extra_args: '--decode=disas_vfp'), decodetree.process('vfp-uncond.decode', extra_args: '--decode=disas_vfp_uncond'), decodetree.process('m-nocp.decode', extra_args: '--decode=disas_m_nocp'), + decodetree.process('mve.decode', extra_args: '--decode=disas_mve'), decodetree.process('a32.decode', extra_args: '--static-decode=disas_a32'), decodetree.process('a32-uncond.decode', extra_args: '--static-decode=disas_a32_uncond'), decodetree.process('t32.decode', extra_args: '--static-decode=disas_t32'), @@ -27,6 +28,7 @@ arm_ss.add(files( 'tlb_helper.c', 'translate.c', 'translate-m-nocp.c', + 'translate-mve.c', 'translate-neon.c', 'translate-vfp.c', 'vec_helper.c', diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c index 166b9d2..9e615cc 100644 --- a/target/arm/mte_helper.c +++ b/target/arm/mte_helper.c @@ -730,7 +730,7 @@ static int mte_probe_int(CPUARMState *env, uint32_t desc, uint64_t ptr, prev_page = ptr & TARGET_PAGE_MASK; next_page = prev_page + TARGET_PAGE_SIZE; - if (likely(tag_last - prev_page <= TARGET_PAGE_SIZE)) { + if (likely(tag_last - prev_page < TARGET_PAGE_SIZE)) { /* Memory access stays on one page. 
*/ tag_size = ((tag_byte_last - tag_byte_first) / (2 * TAG_GRANULE)) + 1; mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, sizem1 + 1, diff --git a/target/arm/mve.decode b/target/arm/mve.decode new file mode 100644 index 0000000..c8492bb --- /dev/null +++ b/target/arm/mve.decode @@ -0,0 +1,20 @@ +# M-profile MVE instruction descriptions +# +# Copyright (c) 2021 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see <http://www.gnu.org/licenses/>. + +# +# This file is processed by scripts/decodetree.py +# diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c index 46a957b..dab5f1d 100644 --- a/target/arm/sve_helper.c +++ b/target/arm/sve_helper.c @@ -103,108 +103,13 @@ uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) return flags; } -/* Expand active predicate bits to bytes, for byte elements. - * for (i = 0; i < 256; ++i) { - * unsigned long m = 0; - * for (j = 0; j < 8; j++) { - * if ((i >> j) & 1) { - * m |= 0xfful << (j << 3); - * } - * } - * printf("0x%016lx,\n", m); - * } +/* + * Expand active predicate bits to bytes, for byte elements. + * (The data table itself is in vec_helper.c as MVE also needs it.) 
*/ static inline uint64_t expand_pred_b(uint8_t byte) { - static const uint64_t word[256] = { - 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, - 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, - 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, - 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, - 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, - 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, - 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, - 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, - 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, - 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, - 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, - 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, - 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, - 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, - 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, - 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, - 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, - 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, - 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, - 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, - 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, - 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, - 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, - 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, - 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, - 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, - 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, - 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, - 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, - 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, - 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 
0x00ff00ffffff0000, - 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, - 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, - 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, - 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, - 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, - 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, - 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, - 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, - 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, - 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, - 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, - 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, - 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, - 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, - 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, - 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, - 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, - 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, - 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, - 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, - 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, - 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, - 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, - 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, - 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, - 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, - 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, - 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, - 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, - 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, - 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, - 0xff00ffffff00ff00, 0xff00ffffff00ffff, 
0xff00ffffffff0000, - 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, - 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, - 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, - 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, - 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, - 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, - 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, - 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, - 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, - 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, - 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, - 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, - 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, - 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, - 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, - 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, - 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, - 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, - 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, - 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, - 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, - 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, - 0xffffffffffffffff, - }; - return word[byte]; + return expand_pred_b_data[byte]; } /* Similarly for half-word elements. @@ -247,26 +152,6 @@ static inline uint64_t expand_pred_s(uint8_t byte) return word[byte & 0x11]; } -/* Swap 16-bit words within a 32-bit word. */ -static inline uint32_t hswap32(uint32_t h) -{ - return rol32(h, 16); -} - -/* Swap 16-bit words within a 64-bit word. */ -static inline uint64_t hswap64(uint64_t h) -{ - uint64_t m = 0x0000ffff0000ffffull; - h = rol64(h, 32); - return ((h & m) << 16) | ((h >> 16) & m); -} - -/* Swap 32-bit words within a 64-bit word. 
*/ -static inline uint64_t wswap64(uint64_t h) -{ - return rol64(h, 32); -} - #define LOGICAL_PPPP(NAME, FUNC) \ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ { \ @@ -905,23 +790,23 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) -DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, , float64_add) +DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add) DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) -DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, , float64_maxnum) +DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) -DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, , float64_minnum) +DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) -DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, , float64_max) +DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) -DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, , float64_min) +DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) #undef DO_ZPZZ_PAIR_FP @@ -1171,35 +1056,35 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD) +DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB) +DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) -DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD) +DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD) +DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB) +DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) -DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD) +DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) -DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL) +DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) -DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL) +DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) /* Note that the multiply cannot overflow, but the doubling can. 
*/ static inline int16_t do_sqdmull_h(int16_t n, int16_t m) @@ -1222,7 +1107,7 @@ static inline int64_t do_sqdmull_d(int64_t n, int64_t m) DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) -DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d) +DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) #undef DO_ZZZ_TB @@ -1240,19 +1125,19 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD) +DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB) +DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD) +DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB) +DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) #undef DO_ZZZ_WTB @@ -1272,7 +1157,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) -DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR) +DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) #undef DO_ZZZ_NTB @@ -1291,29 +1176,29 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 
\ DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) -DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, , H1_4, DO_ABD) +DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) -DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD) +DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) -DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, , H1_4, DO_MUL) +DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) -DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, , H1_4, DO_MUL) +DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) #define DO_NMUL(N, M) -(N * M) DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) -DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, , H1_4, DO_NMUL) +DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) -DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, , H1_4, DO_NMUL) +DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) #undef DO_ZZZW_ACC @@ -1425,14 +1310,14 @@ DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h, DO_SQADD_H) DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s, DO_SQADD_S) -DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, , H1_4, +DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 
do_sqdmull_d, do_sqadd_d) DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h, DO_SQSUB_H) DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s, DO_SQSUB_S) -DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, , H1_4, +DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d, do_sqsub_d) #undef DO_SQDMLAL @@ -1460,7 +1345,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) -DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, , DO_CMLA) +DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) #define DO_SQRDMLAH_B(N, M, A, S) \ do_sqrdmlah_b(N, M, A, S, true) @@ -1474,7 +1359,7 @@ DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, , DO_CMLA) DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) -DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, , DO_SQRDMLAH_D) +DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ @@ -1632,7 +1517,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) -DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, , DO_SQRDMLAH_D) +DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) #define DO_SQRDMLSH_H(N, M, A) \ ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) @@ -1642,7 +1527,7 @@ DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, , DO_SQRDMLAH_D) DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) -DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, , DO_SQRDMLSH_D) 
+DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) #undef DO_ZZXZ @@ -1665,28 +1550,28 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ #define DO_MLA(N, M, A) (A + N * M) DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) -DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, , H1_4, DO_MLA) +DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) -DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, , H1_4, DO_MLA) +DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) #define DO_MLS(N, M, A) (A - N * M) DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) -DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, , H1_4, DO_MLS) +DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) -DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, , H1_4, DO_MLS) +DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) -DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, , H1_4, DO_SQDMLAL_D) +DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) -DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, , H1_4, DO_SQDMLSL_D) +DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) #undef DO_MLA #undef DO_MLS @@ -1708,13 +1593,13 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ } DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) -DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, , H1_4, do_sqdmull_d) +DO_ZZX(sve2_sqdmull_idx_d, int64_t, 
int32_t, H1_8, H1_4, do_sqdmull_d) DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) -DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, , H1_4, DO_MUL) +DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) -DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, , H1_4, DO_MUL) +DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) #undef DO_ZZX @@ -1824,12 +1709,12 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) -DO_CADD(sve2_cadd_d, int64_t, , DO_ADD, DO_SUB) +DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) -DO_CADD(sve2_sqcadd_d, int64_t, , do_sqadd_d, do_sqsub_d) +DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) #undef DO_CADD @@ -1847,11 +1732,11 @@ void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2) -DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, , H1_4) +DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) -DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, , H1_4) +DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) #undef DO_ZZI_SHLL @@ -2289,7 +2174,7 @@ DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) -DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, , H1_4, DO_SHR) +DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, 
do_urshr) DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) @@ -2297,7 +2182,7 @@ DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) -DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, , H1_4, do_urshr) +DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) @@ -2310,7 +2195,7 @@ DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) -DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, , H1_4, DO_SQSHRUN_D) +DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) @@ -2322,7 +2207,7 @@ DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) -DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRUN_D) +DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) @@ -2334,7 +2219,7 @@ DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) -DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, , H1_4, DO_SQSHRN_D) +DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) #define 
DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) @@ -2346,7 +2231,7 @@ DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) -DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRN_D) +DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) @@ -2358,7 +2243,7 @@ DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) -DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQSHRN_D) +DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) @@ -2370,7 +2255,7 @@ DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) -DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQRSHRN_D) +DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) #undef DO_SHRNB #undef DO_SHRNT @@ -2408,7 +2293,7 @@ DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) -DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_ADDHN) +DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) @@ -2416,7 +2301,7 @@ DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 
DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) -DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_RADDHN) +DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) @@ -2424,7 +2309,7 @@ DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) -DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_SUBHN) +DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) @@ -2432,7 +2317,7 @@ DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) -DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_RSUBHN) +DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) #undef DO_RSUBHN #undef DO_SUBHN @@ -3040,7 +2925,7 @@ void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ DO_INSR(sve_insr_b, uint8_t, H1) DO_INSR(sve_insr_h, uint16_t, H1_2) DO_INSR(sve_insr_s, uint32_t, H1_4) -DO_INSR(sve_insr_d, uint64_t, ) +DO_INSR(sve_insr_d, uint64_t, H1_8) #undef DO_INSR @@ -3159,7 +3044,7 @@ void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ DO_TB(b, uint8_t, H1) DO_TB(h, uint16_t, H2) DO_TB(s, uint32_t, H4) -DO_TB(d, uint64_t, ) +DO_TB(d, uint64_t, H8) #undef DO_TB @@ -3180,11 +3065,11 @@ void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 
-DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4) +DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) -DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4) +DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) #undef DO_UNPK @@ -3519,7 +3404,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ DO_ZIP(sve_zip_b, uint8_t, H1) DO_ZIP(sve_zip_h, uint16_t, H1_2) DO_ZIP(sve_zip_s, uint32_t, H1_4) -DO_ZIP(sve_zip_d, uint64_t, ) +DO_ZIP(sve_zip_d, uint64_t, H1_8) DO_ZIP(sve2_zip_q, Int128, ) #define DO_UZP(NAME, TYPE, H) \ @@ -3548,7 +3433,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ DO_UZP(sve_uzp_b, uint8_t, H1) DO_UZP(sve_uzp_h, uint16_t, H1_2) DO_UZP(sve_uzp_s, uint32_t, H1_4) -DO_UZP(sve_uzp_d, uint64_t, ) +DO_UZP(sve_uzp_d, uint64_t, H1_8) DO_UZP(sve2_uzp_q, Int128, ) #define DO_TRN(NAME, TYPE, H) \ @@ -3571,7 +3456,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ DO_TRN(sve_trn_b, uint8_t, H1) DO_TRN(sve_trn_h, uint16_t, H1_2) DO_TRN(sve_trn_s, uint32_t, H1_4) -DO_TRN(sve_trn_d, uint64_t, ) +DO_TRN(sve_trn_d, uint64_t, H1_8) DO_TRN(sve2_trn_q, Int128, ) #undef DO_ZIP @@ -3766,7 +3651,7 @@ uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ - DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull) + DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) @@ -3911,7 +3796,7 @@ uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ - DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull) + DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 
0x0101010101010101ull) DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) @@ -4331,24 +4216,24 @@ uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \ DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) -DO_REDUCE(sve_faddv_d, float64, , add, float64_zero) +DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) /* Identity is floatN_default_nan, without the function call. */ DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) -DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL) +DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) -DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL) +DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) -DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity) +DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) -DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity)) +DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) #undef DO_REDUCE @@ -4432,35 +4317,35 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) -DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add) +DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) -DO_ZPZZ_FP(sve_fsub_d, uint64_t, , 
float64_sub) +DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) -DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul) +DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) -DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div) +DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) -DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min) +DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) -DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max) +DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) -DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum) +DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) -DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum) +DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) static inline float16 abd_h(float16 a, float16 b, float_status *s) { @@ -4479,7 +4364,7 @@ static inline float64 abd_d(float64 a, float64 b, float_status *s) DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) -DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d) +DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) { @@ -4489,11 +4374,11 @@ static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) -DO_ZPZZ_FP(sve_fscalbn_d, 
int64_t, , scalbn_d) +DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) -DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd) +DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) #undef DO_ZPZZ_FP @@ -4521,15 +4406,15 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) -DO_ZPZS_FP(sve_fadds_d, float64, , float64_add) +DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) -DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub) +DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) -DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul) +DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) static inline float16 subr_h(float16 a, float16 b, float_status *s) { @@ -4548,23 +4433,23 @@ static inline float64 subr_d(float64 a, float64 b, float_status *s) DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) -DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d) +DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) -DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum) +DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) -DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum) +DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) -DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max) 
+DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) -DO_ZPZS_FP(sve_fmins_d, float64, , float64_min) +DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) /* Fully general two-operand expander, controlled by a predicate, * With the extra float_status parameter. @@ -4709,58 +4594,58 @@ static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) -DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16) -DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64) -DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32) -DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64) +DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) +DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) +DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) +DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) -DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz) -DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz) -DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd) -DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz) +DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) +DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) +DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) +DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) -DO_ZPZ_FP(sve_fcvtzu_hd, 
uint64_t, , vfp_float16_to_uint64_rtz) -DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz) -DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd) -DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz) +DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) +DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) +DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) +DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) -DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd) +DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) -DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int) +DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) -DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64) +DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) -DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt) +DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) -DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64) -DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16) -DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32) -DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64) +DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) +DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) +DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) +DO_ZPZ_FP(sve_scvt_dd, uint64_t, 
H1_8, int64_to_float64) DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) -DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64) -DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16) -DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32) -DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64) +DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) +DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) +DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) +DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) static int16_t do_float16_logb_as_int(float16 a, float_status *s) { @@ -4848,7 +4733,7 @@ static int64_t do_float64_logb_as_int(float64 a, float_status *s) DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) -DO_ZPZ_FP(flogb_d, float64, , do_float64_logb_as_int) +DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) #undef DO_ZPZ_FP @@ -5026,7 +4911,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ #define DO_FPCMP_PPZZ_S(NAME, OP) \ DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) #define DO_FPCMP_PPZZ_D(NAME, OP) \ - DO_FPCMP_PPZZ(NAME##_d, float64, , OP) + DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ DO_FPCMP_PPZZ_H(NAME, OP) \ @@ -5087,7 +4972,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, \ #define DO_FPCMP_PPZ0_S(NAME, OP) \ DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) #define DO_FPCMP_PPZ0_D(NAME, OP) \ - DO_FPCMP_PPZ0(NAME##_d, float64, , OP) + DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ DO_FPCMP_PPZ0_H(NAME, OP) \ @@ -5467,8 +5352,8 @@ DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t) DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t) DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t) -DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t) 
-DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t) +DO_LD_PRIM_1(ld1bdu, H1_8, uint64_t, uint8_t) +DO_LD_PRIM_1(ld1bds, H1_8, uint64_t, int8_t) #define DO_ST_PRIM_1(NAME, H, TE, TM) \ DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \ @@ -5477,7 +5362,7 @@ DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t) DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t) DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t) DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t) -DO_ST_PRIM_1(bd, , uint64_t, uint8_t) +DO_ST_PRIM_1(bd, H1_8, uint64_t, uint8_t) #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \ DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \ @@ -5494,22 +5379,22 @@ DO_ST_PRIM_1(bd, , uint64_t, uint8_t) DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw) DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw) DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw) -DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw) -DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw) +DO_LD_PRIM_2(hdu, H1_8, uint64_t, uint16_t, lduw) +DO_LD_PRIM_2(hds, H1_8, uint64_t, int16_t, lduw) DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw) DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw) -DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw) +DO_ST_PRIM_2(hd, H1_8, uint64_t, uint16_t, stw) DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl) -DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl) -DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl) +DO_LD_PRIM_2(sdu, H1_8, uint64_t, uint32_t, ldl) +DO_LD_PRIM_2(sds, H1_8, uint64_t, int32_t, ldl) DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl) -DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl) +DO_ST_PRIM_2(sd, H1_8, uint64_t, uint32_t, stl) -DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq) -DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq) +DO_LD_PRIM_2(dd, H1_8, uint64_t, uint64_t, ldq) +DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq) #undef DO_LD_TLB #undef DO_ST_TLB @@ -7743,7 +7628,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, 
H1_4, H1_2, sve_f32_to_f16) -DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, , H1_4, float64_to_float32) +DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ @@ -7763,7 +7648,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ } DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) -DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, , H1_4, float32_to_float64) +DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) #undef DO_FCVTLT #undef DO_FCVTNT diff --git a/target/arm/t32.decode b/target/arm/t32.decode index 8b2c487..0f9326c 100644 --- a/target/arm/t32.decode +++ b/target/arm/t32.decode @@ -671,8 +671,17 @@ BL 1111 0. .......... 11.1 ............ @branch24 # LE and WLS immediate %lob_imm 1:10 11:1 !function=times_2 - DLS 1111 0 0000 100 rn:4 1110 0000 0000 0001 - WLS 1111 0 0000 100 rn:4 1100 . .......... 1 imm=%lob_imm - LE 1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm + DLS 1111 0 0000 100 rn:4 1110 0000 0000 0001 size=4 + WLS 1111 0 0000 100 rn:4 1100 . .......... 1 imm=%lob_imm size=4 + { + LE 1111 0 0000 0 f:1 tp:1 1111 1100 . .......... 1 imm=%lob_imm + # This is WLSTP + WLS 1111 0 0000 0 size:2 rn:4 1100 . .......... 
1 imm=%lob_imm + } + { + LCTP 1111 0 0000 000 1111 1110 0000 0000 0001 + # This is DLSTP + DLS 1111 0 0000 0 size:2 rn:4 1110 0000 0000 0001 + } ] } diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h index c997f4e..0a00539 100644 --- a/target/arm/translate-a32.h +++ b/target/arm/translate-a32.h @@ -22,6 +22,7 @@ /* Prototypes for autogenerated disassembler functions */ bool disas_m_nocp(DisasContext *dc, uint32_t insn); +bool disas_mve(DisasContext *dc, uint32_t insn); bool disas_vfp(DisasContext *s, uint32_t insn); bool disas_vfp_uncond(DisasContext *s, uint32_t insn); bool disas_neon_dp(DisasContext *s, uint32_t insn); @@ -44,6 +45,7 @@ long vfp_reg_offset(bool dp, unsigned reg); long neon_full_reg_offset(unsigned reg); long neon_element_offset(int reg, int element, MemOp memop); void gen_rev16(TCGv_i32 dest, TCGv_i32 var); +void clear_eci_state(DisasContext *s); static inline TCGv_i32 load_cpu_offset(int offset) { diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index 8713dfe..7f74d0e 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -8291,7 +8291,6 @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn) } break; default: - fprintf(stderr, "%s: cmode_3_1: %x\n", __func__, cmode_3_1); g_assert_not_reached(); } @@ -11990,38 +11989,46 @@ static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn) */ static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn) { - int opcode, fpopcode; - int is_q, u, a, rm, rn, rd; - int datasize, elements; - int pass; - TCGv_ptr fpst; - bool pairwise = false; - - if (!dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - /* For these floating point ops, the U, a and opcode bits + int opcode = extract32(insn, 11, 3); + int u = extract32(insn, 29, 1); + int a = extract32(insn, 23, 1); + int is_q = extract32(insn, 30, 1); + int rm = extract32(insn, 16, 5); + int rn = 
extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + /* + * For these floating point ops, the U, a and opcode bits * together indicate the operation. */ - opcode = extract32(insn, 11, 3); - u = extract32(insn, 29, 1); - a = extract32(insn, 23, 1); - is_q = extract32(insn, 30, 1); - rm = extract32(insn, 16, 5); - rn = extract32(insn, 5, 5); - rd = extract32(insn, 0, 5); - - fpopcode = opcode | (a << 3) | (u << 4); - datasize = is_q ? 128 : 64; - elements = datasize / 16; + int fpopcode = opcode | (a << 3) | (u << 4); + int datasize = is_q ? 128 : 64; + int elements = datasize / 16; + bool pairwise; + TCGv_ptr fpst; + int pass; switch (fpopcode) { + case 0x0: /* FMAXNM */ + case 0x1: /* FMLA */ + case 0x2: /* FADD */ + case 0x3: /* FMULX */ + case 0x4: /* FCMEQ */ + case 0x6: /* FMAX */ + case 0x7: /* FRECPS */ + case 0x8: /* FMINNM */ + case 0x9: /* FMLS */ + case 0xa: /* FSUB */ + case 0xe: /* FMIN */ + case 0xf: /* FRSQRTS */ + case 0x13: /* FMUL */ + case 0x14: /* FCMGE */ + case 0x15: /* FACGE */ + case 0x17: /* FDIV */ + case 0x1a: /* FABD */ + case 0x1c: /* FCMGT */ + case 0x1d: /* FACGT */ + pairwise = false; + break; case 0x10: /* FMAXNMP */ case 0x12: /* FADDP */ case 0x16: /* FMAXP */ @@ -12029,6 +12036,18 @@ static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn) case 0x1e: /* FMINP */ pairwise = true; break; + default: + unallocated_encoding(s); + return; + } + + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; } fpst = fpstatus_ptr(FPST_FPCR_F16); @@ -12153,8 +12172,6 @@ static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn) gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); break; default: - fprintf(stderr, "%s: insn 0x%04x, fpop 0x%2x @ 0x%" PRIx64 "\n", - __func__, insn, fpopcode, s->pc_curr); g_assert_not_reached(); } @@ -13234,8 +13251,8 @@ static void disas_simd_two_reg_misc_fp16(DisasContext *s, uint32_t insn) case 0x7f: 
/* FSQRT (vector) */ break; default: - fprintf(stderr, "%s: insn 0x%04x fpop 0x%2x\n", __func__, insn, fpop); - g_assert_not_reached(); + unallocated_encoding(s); + return; } diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c index d47eb8e..09b3be4 100644 --- a/target/arm/translate-m-nocp.c +++ b/target/arm/translate-m-nocp.c @@ -75,8 +75,12 @@ static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a) unallocated_encoding(s); return true; } + + s->eci_handled = true; + /* If no fpu, NOP. */ if (!dc_isar_feature(aa32_vfp, s)) { + clear_eci_state(s); return true; } @@ -88,6 +92,8 @@ static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a) } tcg_temp_free_i32(fptr); + clear_eci_state(s); + /* End the TB, because we have updated FP control bits */ s->base.is_jmp = DISAS_UPDATE_EXIT; return true; @@ -110,8 +116,11 @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a) return true; } + s->eci_handled = true; + if (!dc_isar_feature(aa32_vfp_simd, s)) { /* NOP if we have neither FP nor MVE */ + clear_eci_state(s); return true; } @@ -173,7 +182,12 @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a) btmreg++; } assert(btmreg == topreg + 1); - /* TODO: when MVE is implemented, zero VPR here */ + if (dc_isar_feature(aa32_mve, s)) { + TCGv_i32 z32 = tcg_const_i32(0); + store_cpu_field(z32, v7m.vpr); + } + + clear_eci_state(s); return true; } diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c new file mode 100644 index 0000000..e91f526 --- /dev/null +++ b/target/arm/translate-mve.c @@ -0,0 +1,29 @@ +/* + * ARM translation: M-profile MVE instructions + * + * Copyright (c) 2021 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "exec/exec-all.h" +#include "exec/gen-icount.h" +#include "translate.h" +#include "translate-a32.h" + +/* Include the generated decoder */ +#include "decode-mve.c.inc" diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c index d01e465..01e26a2 100644 --- a/target/arm/translate-vfp.c +++ b/target/arm/translate-vfp.c @@ -143,11 +143,21 @@ static void gen_preserve_fp_state(DisasContext *s) static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) { if (s->fp_excp_el) { - /* M-profile handled this earlier, in disas_m_nocp() */ - assert (!arm_dc_feature(s, ARM_FEATURE_M)); - gen_exception_insn(s, s->pc_curr, EXCP_UDEF, - syn_fp_access_trap(1, 0xe, false), - s->fp_excp_el); + if (arm_dc_feature(s, ARM_FEATURE_M)) { + /* + * M-profile mostly catches the "FPU disabled" case early, in + * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP) + * which do coprocessor-checks are outside the large ranges of + * the encoding space handled by the patterns in m-nocp.decode, + * and for them we may need to raise NOCP here. 
+ */ + gen_exception_insn(s, s->pc_curr, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + } else { + gen_exception_insn(s, s->pc_curr, EXCP_UDEF, + syn_fp_access_trap(1, 0xe, false), + s->fp_excp_el); + } return false; } @@ -180,8 +190,8 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) if (s->v7m_new_fp_ctxt_needed) { /* - * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA - * and the FPSCR. + * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA, + * the FPSCR, and VPR. */ TCGv_i32 control, fpscr; uint32_t bits = R_V7M_CONTROL_FPCA_MASK; @@ -189,6 +199,11 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]); gen_helper_vfp_set_fpscr(cpu_env, fpscr); tcg_temp_free_i32(fpscr); + if (dc_isar_feature(aa32_mve, s)) { + TCGv_i32 z32 = tcg_const_i32(0); + store_cpu_field(z32, v7m.vpr); + } + /* * We don't need to arrange to end the TB, because the only * parts of FPSCR which we cache in the TB flags are the VECLEN @@ -784,10 +799,17 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, { TCGv_i32 fpscr; tmp = loadfn(s, opaque); - /* - * TODO: when we implement MVE, write the QC bit. - * For non-MVE, QC is RES0. - */ + if (dc_isar_feature(aa32_mve, s)) { + /* QC is only present for MVE; otherwise RES0 */ + TCGv_i32 qc = tcg_temp_new_i32(); + tcg_gen_andi_i32(qc, tmp, FPCR_QC); + /* + * The 4 vfp.qc[] fields need only be "zero" vs "non-zero"; + * here writing the same value into all elements is simplest. 
+ */ + tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc), + 16, 16, qc); + } tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK); @@ -869,6 +891,11 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, break; } + if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) { + /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */ + regno = QEMU_VFP_FPSCR_NZCV; + } + switch (regno) { case ARM_VFP_FPSCR: tmp = tcg_temp_new_i32(); @@ -876,11 +903,11 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, storefn(s, opaque, tmp); break; case ARM_VFP_FPSCR_NZCVQC: - /* - * TODO: MVE has a QC bit, which we probably won't store - * in the xregs[] field. For non-MVE, where QC is RES0, - * we can just fall through to the FPSCR_NZCV case. - */ + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK); + storefn(s, opaque, tmp); + break; case QEMU_VFP_FPSCR_NZCV: /* * Read just NZCV; this is a special case to avoid the @@ -1545,6 +1572,8 @@ static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a) return false; } + s->eci_handled = true; + if (!vfp_access_check(s)) { return true; } @@ -1594,6 +1623,7 @@ static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a) tcg_temp_free_i32(addr); } + clear_eci_state(s); return true; } @@ -1628,6 +1658,8 @@ static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a) return false; } + s->eci_handled = true; + if (!vfp_access_check(s)) { return true; } @@ -1684,6 +1716,7 @@ static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a) tcg_temp_free_i32(addr); } + clear_eci_state(s); return true; } diff --git a/target/arm/translate.c b/target/arm/translate.c index 8e0e55c..9e2cca7 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -309,6 +309,19 @@ static inline bool is_singlestepping(DisasContext *s) 
return s->base.singlestep_enabled || s->ss_active; } +void clear_eci_state(DisasContext *s) +{ + /* + * Clear any ECI/ICI state: used when a load multiple/store + * multiple insn executes. + */ + if (s->eci) { + TCGv_i32 tmp = tcg_const_i32(0); + store_cpu_field(tmp, condexec_bits); + s->eci = 0; + } +} + static void gen_smul_dual(TCGv_i32 a, TCGv_i32 b) { TCGv_i32 tmp1 = tcg_temp_new_i32(); @@ -6203,6 +6216,8 @@ static bool trans_BKPT(DisasContext *s, arg_BKPT *a) if (!ENABLE_ARCH_5) { return false; } + /* BKPT is OK with ECI set and leaves it untouched */ + s->eci_handled = true; if (arm_dc_feature(s, ARM_FEATURE_M) && semihosting_enabled() && #ifndef CONFIG_USER_ONLY @@ -7767,6 +7782,8 @@ static bool op_stm(DisasContext *s, arg_ldst_block *a, int min_n) return true; } + s->eci_handled = true; + addr = op_addr_block_pre(s, a, n); mem_idx = get_mem_index(s); @@ -7793,6 +7810,7 @@ static bool op_stm(DisasContext *s, arg_ldst_block *a, int min_n) } op_addr_block_post(s, a, addr, n); + clear_eci_state(s); return true; } @@ -7847,6 +7865,8 @@ static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n) return true; } + s->eci_handled = true; + addr = op_addr_block_pre(s, a, n); mem_idx = get_mem_index(s); loaded_base = false; @@ -7897,6 +7917,7 @@ static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n) /* Must exit loop to check un-masked IRQs */ s->base.is_jmp = DISAS_EXIT; } + clear_eci_state(s); return true; } @@ -7952,6 +7973,8 @@ static bool trans_CLRM(DisasContext *s, arg_CLRM *a) return false; } + s->eci_handled = true; + zero = tcg_const_i32(0); for (i = 0; i < 15; i++) { if (extract32(a->list, i, 1)) { @@ -7969,6 +7992,7 @@ static bool trans_CLRM(DisasContext *s, arg_CLRM *a) tcg_temp_free_i32(maskreg); } tcg_temp_free_i32(zero); + clear_eci_state(s); return true; } @@ -8090,13 +8114,32 @@ static bool trans_DLS(DisasContext *s, arg_DLS *a) return false; } if (a->rn == 13 || a->rn == 15) { - /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */ + 
/* + * For DLSTP rn == 15 is a related encoding (LCTP); the + * other cases caught by this condition are all + * CONSTRAINED UNPREDICTABLE: we choose to UNDEF + */ return false; } - /* Not a while loop, no tail predication: just set LR to the count */ + if (a->size != 4) { + /* DLSTP */ + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + if (!vfp_access_check(s)) { + return true; + } + } + + /* Not a while loop: set LR to the count, and set LTPSIZE for DLSTP */ tmp = load_reg(s, a->rn); store_reg(s, 14, tmp); + if (a->size != 4) { + /* DLSTP: set FPSCR.LTPSIZE */ + tmp = tcg_const_i32(a->size); + store_cpu_field(tmp, v7m.ltpsize); + } return true; } @@ -8110,7 +8153,11 @@ static bool trans_WLS(DisasContext *s, arg_WLS *a) return false; } if (a->rn == 13 || a->rn == 15) { - /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */ + /* + * For WLSTP rn == 15 is a related encoding (LE); the + * other cases caught by this condition are all + * CONSTRAINED UNPREDICTABLE: we choose to UNDEF + */ return false; } if (s->condexec_mask) { @@ -8123,10 +8170,41 @@ static bool trans_WLS(DisasContext *s, arg_WLS *a) */ return false; } + if (a->size != 4) { + /* WLSTP */ + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + /* + * We need to check that the FPU is enabled here, but mustn't + * call vfp_access_check() to do that because we don't want to + * do the lazy state preservation in the "loop count is zero" case. + * Do the check-and-raise-exception by hand. + */ + if (s->fp_excp_el) { + gen_exception_insn(s, s->pc_curr, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return true; + } + } + nextlabel = gen_new_label(); tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_R[a->rn], 0, nextlabel); tmp = load_reg(s, a->rn); store_reg(s, 14, tmp); + if (a->size != 4) { + /* + * WLSTP: set FPSCR.LTPSIZE. This requires that we do the + * lazy state preservation, new FP context creation, etc, + * that vfp_access_check() does. 
We know that the actual + * access check will succeed (ie it won't generate code that + * throws an exception) because we did that check by hand earlier. + */ + bool ok = vfp_access_check(s); + assert(ok); + tmp = tcg_const_i32(a->size); + store_cpu_field(tmp, v7m.ltpsize); + } gen_jmp_tb(s, s->base.pc_next, 1); gen_set_label(nextlabel); @@ -8145,25 +8223,140 @@ static bool trans_LE(DisasContext *s, arg_LE *a) * any faster. */ TCGv_i32 tmp; + TCGLabel *loopend; + bool fpu_active; if (!dc_isar_feature(aa32_lob, s)) { return false; } + if (a->f && a->tp) { + return false; + } + if (s->condexec_mask) { + /* + * LE in an IT block is CONSTRAINED UNPREDICTABLE; + * we choose to UNDEF, because otherwise our use of + * gen_goto_tb(1) would clash with the use of TB exit 1 + * in the dc->condjmp condition-failed codepath in + * arm_tr_tb_stop() and we'd get an assertion. + */ + return false; + } + if (a->tp) { + /* LETP */ + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + if (!vfp_access_check(s)) { + s->eci_handled = true; + return true; + } + } - if (!a->f) { - /* Not loop-forever. If LR <= 1 this is the last loop: do nothing. */ - arm_gen_condlabel(s); - tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, s->condlabel); - /* Decrement LR */ - tmp = load_reg(s, 14); - tcg_gen_addi_i32(tmp, tmp, -1); - store_reg(s, 14, tmp); + /* LE/LETP is OK with ECI set and leaves it untouched */ + s->eci_handled = true; + + /* + * With MVE, LTPSIZE might not be 4, and we must emit an INVSTATE + * UsageFault exception for the LE insn in that case. Note that we + * are not directly checking FPSCR.LTPSIZE but instead check the + * pseudocode LTPSIZE() function, which returns 4 if the FPU is + * not currently active (ie ActiveFPState() returns false). 
We + * can identify not-active purely from our TB state flags, as the + * FPU is active only if: + * the FPU is enabled + * AND lazy state preservation is not active + * AND we do not need a new fp context (this is the ASPEN/FPCA check) + * + * Usually we don't need to care about this distinction between + * LTPSIZE and FPSCR.LTPSIZE, because the code in vfp_access_check() + * will either take an exception or clear the conditions that make + * the FPU not active. But LE is an unusual case of a non-FP insn + * that looks at LTPSIZE. + */ + fpu_active = !s->fp_excp_el && !s->v7m_lspact && !s->v7m_new_fp_ctxt_needed; + + if (!a->tp && dc_isar_feature(aa32_mve, s) && fpu_active) { + /* Need to do a runtime check for LTPSIZE != 4 */ + TCGLabel *skipexc = gen_new_label(); + tmp = load_cpu_field(v7m.ltpsize); + tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 4, skipexc); + tcg_temp_free_i32(tmp); + gen_exception_insn(s, s->pc_curr, EXCP_INVSTATE, syn_uncategorized(), + default_exception_el(s)); + gen_set_label(skipexc); + } + + if (a->f) { + /* Loop-forever: just jump back to the loop start */ + gen_jmp(s, read_pc(s) - a->imm); + return true; + } + + /* + * Not loop-forever. If LR <= loop-decrement-value this is the last loop. + * For LE, we know at this point that LTPSIZE must be 4 and the + * loop decrement value is 1. For LETP we need to calculate the decrement + * value from LTPSIZE. + */ + loopend = gen_new_label(); + if (!a->tp) { + tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, loopend); + tcg_gen_addi_i32(cpu_R[14], cpu_R[14], -1); + } else { + /* + * Decrement by 1 << (4 - LTPSIZE). We need to use a TCG local + * so that decr stays live after the brcondi. 
+ */ + TCGv_i32 decr = tcg_temp_local_new_i32(); + TCGv_i32 ltpsize = load_cpu_field(v7m.ltpsize); + tcg_gen_sub_i32(decr, tcg_constant_i32(4), ltpsize); + tcg_gen_shl_i32(decr, tcg_constant_i32(1), decr); + tcg_temp_free_i32(ltpsize); + + tcg_gen_brcond_i32(TCG_COND_LEU, cpu_R[14], decr, loopend); + + tcg_gen_sub_i32(cpu_R[14], cpu_R[14], decr); + tcg_temp_free_i32(decr); } /* Jump back to the loop start */ gen_jmp(s, read_pc(s) - a->imm); + + gen_set_label(loopend); + if (a->tp) { + /* Exits from tail-pred loops must reset LTPSIZE to 4 */ + tmp = tcg_const_i32(4); + store_cpu_field(tmp, v7m.ltpsize); + } + /* End TB, continuing to following insn */ + gen_jmp_tb(s, s->base.pc_next, 1); return true; } +static bool trans_LCTP(DisasContext *s, arg_LCTP *a) +{ + /* + * M-profile Loop Clear with Tail Predication. Since our implementation + * doesn't cache branch information, all we need to do is reset + * FPSCR.LTPSIZE to 4. + */ + TCGv_i32 ltpsize; + + if (!dc_isar_feature(aa32_lob, s) || + !dc_isar_feature(aa32_mve, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + ltpsize = tcg_const_i32(4); + store_cpu_field(ltpsize, v7m.ltpsize); + return true; +} + + static bool op_tbranch(DisasContext *s, arg_tbranch *a, bool half) { TCGv_i32 addr, tmp; @@ -8726,6 +8919,7 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn) if (disas_t32(s, insn) || disas_vfp_uncond(s, insn) || disas_neon_shared(s, insn) || + disas_mve(s, insn) || ((insn >> 28) == 0xe && disas_vfp(s, insn))) { return; } @@ -8775,8 +8969,28 @@ static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs) dc->thumb = EX_TBFLAG_AM32(tb_flags, THUMB); dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? MO_BE : MO_LE; condexec = EX_TBFLAG_AM32(tb_flags, CONDEXEC); - dc->condexec_mask = (condexec & 0xf) << 1; - dc->condexec_cond = condexec >> 4; + /* + * the CONDEXEC TB flags are CPSR bits [15:10][26:25]. On A-profile this + * is always the IT bits. 
On M-profile, some of the reserved encodings + * of IT are used instead to indicate either ICI or ECI, which + * indicate partial progress of a restartable insn that was interrupted + * partway through by an exception: + * * if CONDEXEC[3:0] != 0b0000 : CONDEXEC is IT bits + * * if CONDEXEC[3:0] == 0b0000 : CONDEXEC is ICI or ECI bits + * In all cases CONDEXEC == 0 means "not in IT block or restartable + * insn, behave normally". + */ + dc->eci = dc->condexec_mask = dc->condexec_cond = 0; + dc->eci_handled = false; + dc->insn_eci_rewind = NULL; + if (condexec & 0xf) { + dc->condexec_mask = (condexec & 0xf) << 1; + dc->condexec_cond = condexec >> 4; + } else { + if (arm_feature(env, ARM_FEATURE_M)) { + dc->eci = condexec >> 4; + } + } core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX); dc->mmu_idx = core_to_arm_mmu_idx(env, core_mmu_idx); @@ -8898,10 +9112,19 @@ static void arm_tr_tb_start(DisasContextBase *dcbase, CPUState *cpu) static void arm_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu) { DisasContext *dc = container_of(dcbase, DisasContext, base); + /* + * The ECI/ICI bits share PSR bits with the IT bits, so we + * need to reconstitute the bits from the split-out DisasContext + * fields here. + */ + uint32_t condexec_bits; - tcg_gen_insn_start(dc->base.pc_next, - (dc->condexec_cond << 4) | (dc->condexec_mask >> 1), - 0); + if (dc->eci) { + condexec_bits = dc->eci << 4; + } else { + condexec_bits = (dc->condexec_cond << 4) | (dc->condexec_mask >> 1); + } + tcg_gen_insn_start(dc->base.pc_next, condexec_bits, 0); dc->insn_start = tcg_last_op(); } @@ -9067,6 +9290,40 @@ static void thumb_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) } dc->insn = insn; + if (dc->eci) { + /* + * For M-profile continuable instructions, ECI/ICI handling + * falls into these cases: + * - interrupt-continuable instructions + * These are the various load/store multiple insns (both + * integer and fp). 
The ICI bits indicate the register + * where the load/store can resume. We make the IMPDEF + * choice to always do "instruction restart", ie ignore + * the ICI value and always execute the ldm/stm from the + * start. So all we need to do is zero PSR.ICI if the + * insn executes. + * - MVE instructions subject to beat-wise execution + * Here the ECI bits indicate which beats have already been + * executed, and we must honour this. Each insn of this + * type will handle it correctly. We will update PSR.ECI + * in the helper function for the insn (some ECI values + * mean that the following insn also has been partially + * executed). + * - Special cases which don't advance ECI + * The insns LE, LETP and BKPT leave the ECI/ICI state + * bits untouched. + * - all other insns (the common case) + * Non-zero ECI/ICI means an INVSTATE UsageFault. + * We place a rewind-marker here. Insns in the previous + * three categories will set a flag in the DisasContext. + * If the flag isn't set after we call disas_thumb_insn() + * or disas_thumb2_insn() then we know we have a "some other + * insn" case. We will rewind to the marker (ie throwing away + * all the generated code) and instead emit "take exception". + */ + dc->insn_eci_rewind = tcg_last_op(); + } + if (dc->condexec_mask && !thumb_insn_is_unconditional(dc, insn)) { uint32_t cond = dc->condexec_cond; @@ -9095,6 +9352,17 @@ static void thumb_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) } } + if (dc->eci && !dc->eci_handled) { + /* + * Insn wasn't valid for ECI/ICI at all: undo what we + * just generated and instead emit an exception + */ + tcg_remove_ops_after(dc->insn_eci_rewind); + dc->condjmp = 0; + gen_exception_insn(dc, dc->pc_curr, EXCP_INVSTATE, syn_uncategorized(), + default_exception_el(dc)); + } + arm_post_translate_insn(dc); /* Thumb is a variable-length ISA. 
Stop translation when the next insn diff --git a/target/arm/translate.h b/target/arm/translate.h index 12c28b0..2821b32 100644 --- a/target/arm/translate.h +++ b/target/arm/translate.h @@ -21,6 +21,15 @@ typedef struct DisasContext { /* Thumb-2 conditional execution bits. */ int condexec_mask; int condexec_cond; + /* M-profile ECI/ICI exception-continuable instruction state */ + int eci; + /* + * trans_ functions for insns which are continuable should set this true + * after decode (ie after any UNDEF checks) + */ + bool eci_handled; + /* TCG op to rewind to if this turns out to be an invalid ECI state */ + TCGOp *insn_eci_rewind; int thumb; int sctlr_b; MemOp be_data; diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c index 5862f18..034f6b8 100644 --- a/target/arm/vec_helper.c +++ b/target/arm/vec_helper.c @@ -25,6 +25,108 @@ #include "qemu/int128.h" #include "vec_internal.h" +/* + * Data for expanding active predicate bits to bytes, for byte elements. + * + * for (i = 0; i < 256; ++i) { + * unsigned long m = 0; + * for (j = 0; j < 8; j++) { + * if ((i >> j) & 1) { + * m |= 0xfful << (j << 3); + * } + * } + * printf("0x%016lx,\n", m); + * } + */ +const uint64_t expand_pred_b_data[256] = { + 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, + 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, + 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, + 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, + 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, + 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, + 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, + 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, + 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, + 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, + 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, + 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, + 0x0000ff0000ff0000, 
0x0000ff0000ff00ff, 0x0000ff0000ffff00, + 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, + 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, + 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, + 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, + 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, + 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, + 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, + 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, + 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, + 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, + 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, + 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, + 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, + 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, + 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, + 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, + 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, + 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, + 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, + 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, + 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, + 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, + 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, + 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, + 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, + 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, + 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, + 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, + 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, + 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, + 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, + 0xff00000000ff0000, 
0xff00000000ff00ff, 0xff00000000ffff00, + 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, + 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, + 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, + 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, + 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, + 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, + 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, + 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, + 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, + 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, + 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, + 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, + 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, + 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, + 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, + 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, + 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, + 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, + 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, + 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, + 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, + 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, + 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, + 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, + 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, + 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, + 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, + 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, + 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, + 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, + 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, + 0xffffff0000ff0000, 
0xffffff0000ff00ff, 0xffffff0000ffff00, + 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, + 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, + 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, + 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, + 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, + 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, + 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, + 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, + 0xffffffffffffffff, +}; + /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, bool neg, bool round) @@ -589,8 +691,8 @@ DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) -DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, ) -DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, ) +DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) +DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, void *vfpst, uint32_t desc) @@ -1226,7 +1328,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) -DO_MUL_IDX(gvec_mul_idx_d, uint64_t, ) +DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) #undef DO_MUL_IDX @@ -1248,11 +1350,11 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) -DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, ) +DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) -DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, ) 
+DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) #undef DO_MLA_IDX @@ -1279,7 +1381,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) -DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, ) +DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8) /* * Non-fused multiply-accumulate operations, for Neon. NB that unlike @@ -1317,7 +1419,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) -DO_FMLA_IDX(gvec_fmla_idx_d, float64, ) +DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) #undef DO_FMLA_IDX diff --git a/target/arm/vec_internal.h b/target/arm/vec_internal.h index dba481e..865d213 100644 --- a/target/arm/vec_internal.h +++ b/target/arm/vec_internal.h @@ -42,7 +42,16 @@ #define H2(x) (x) #define H4(x) (x) #endif +/* + * Access to 64-bit elements isn't host-endian dependent; we provide H8 + * and H1_8 so that when a function is being generated from a macro we + * can pass these rather than an empty macro argument, for clarity. + */ +#define H8(x) (x) +#define H1_8(x) (x) +/* Data for expanding active predicate bits to bytes, for byte elements. */ +extern const uint64_t expand_pred_b_data[256]; static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz) { diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c index 496f003..8a71660 100644 --- a/target/arm/vfp_helper.c +++ b/target/arm/vfp_helper.c @@ -220,7 +220,8 @@ void HELPER(vfp_set_fpscr)(CPUARMState *env, uint32_t val) FPCR_LTPSIZE_LENGTH); } - if (arm_feature(env, ARM_FEATURE_NEON)) { + if (arm_feature(env, ARM_FEATURE_NEON) || + cpu_isar_feature(aa32_mve, cpu)) { /* * The bit we set within fpscr_q is arbitrary; the register as a * whole being zero/non-zero is what counts. 
diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target index 928357b..2c05c90 100644 --- a/tests/tcg/aarch64/Makefile.target +++ b/tests/tcg/aarch64/Makefile.target @@ -37,7 +37,7 @@ AARCH64_TESTS += bti-2 # MTE Tests ifneq ($(DOCKER_IMAGE)$(CROSS_CC_HAS_ARMV8_MTE),) -AARCH64_TESTS += mte-1 mte-2 mte-3 mte-4 mte-5 mte-6 +AARCH64_TESTS += mte-1 mte-2 mte-3 mte-4 mte-5 mte-6 mte-7 mte-%: CFLAGS += -march=armv8.5-a+memtag endif diff --git a/tests/tcg/aarch64/mte-7.c b/tests/tcg/aarch64/mte-7.c new file mode 100644 index 0000000..a981de6 --- /dev/null +++ b/tests/tcg/aarch64/mte-7.c @@ -0,0 +1,31 @@ +/* + * Memory tagging, unaligned access crossing pages. + * https://gitlab.com/qemu-project/qemu/-/issues/403 + * + * Copyright (c) 2021 Linaro Ltd + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "mte.h" + +int main(int ac, char **av) +{ + void *p; + + enable_mte(PR_MTE_TCF_SYNC); + p = alloc_mte_mem(2 * 0x1000); + + /* Tag the pointer. */ + p = (void *)((unsigned long)p | (1ul << 56)); + + /* Store tag in sequential granules. */ + asm("stg %0, [%0]" : : "r"(p + 0x0ff0)); + asm("stg %0, [%0]" : : "r"(p + 0x1000)); + + /* + * Perform an unaligned store with tag 1 crossing the pages. + * Failure dies with SIGSEGV. + */ + asm("str %0, [%0]" : : "r"(p + 0x0ffc)); + return 0; +} |