diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2024-09-06 13:59:37 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2024-09-06 13:59:37 +0100 |
commit | ec08d9a51e6af3cd3edbdbf2ca6e97a1e2b5f0d1 (patch) | |
tree | 8ff6f0a345136b2a696e4c384c314d51e8a4b9e8 | |
parent | bdb468294135bf259ed0281d13b0ef5d989e1c9a (diff) | |
parent | 99ec7b440a1d6a6ef07450b68687d24d13a25fb5 (diff) | |
download | qemu-ec08d9a51e6af3cd3edbdbf2ca6e97a1e2b5f0d1.zip qemu-ec08d9a51e6af3cd3edbdbf2ca6e97a1e2b5f0d1.tar.gz qemu-ec08d9a51e6af3cd3edbdbf2ca6e97a1e2b5f0d1.tar.bz2 |
Merge tag 'pull-target-arm-20240905' of https://git.linaro.org/people/pmaydell/qemu-arm into staging
target-arm queue:
* Implement FEAT_EBF16 emulation
* accel/tcg: Remove dead code from rr_cpu_thread_fn()
* hw: add compat machines for 9.2
* virt: default to two-stage SMMU from virt-9.2
* sbsa-ref: use two-stage SMMU
* hw: Various minor memory leak fixes
* target/arm: Correct names of VFP VFNMA and VFNMS insns
* hw/arm/xilinx_zynq: Enable Security Extensions
* hw/arm/boot: Report error msg if loading elf/dtb failed
# -----BEGIN PGP SIGNATURE-----
#
# iQJNBAABCAA3FiEE4aXFk81BneKOgxXPPCUl7RQ2DN4FAmbZqzEZHHBldGVyLm1h
# eWRlbGxAbGluYXJvLm9yZwAKCRA8JSXtFDYM3lJ7D/9s/ZTkiCj/z+caHotwNJVt
# ECgEEVinitwZxSMINZd1f6bxTY8hYVjMewj6A6RvHtMJMr7SUOmL8wi0YlbhTm44
# jb8dZVf3pzPaZ399jxOeGnFipGyKmK0XM5rKc7CP6yJUS3B9RkUbLEHng8Q0ZBtl
# cnZqI12jJBdtHU8D4JIvBgM2N2ay4bKY8EQEPCv4S7ZTKawWcKgSR5pMd2TBIqIT
# 0gaDL3eOgCt2XWIrMzRjvaJK70obN/+n+vZQskJ/sIDsw+Kz8sZGlivdBXLRmQ+A
# OUgtdyZoD42Q8KtwM0bjoaoxz6VMNPJp5khB45EPjVgWyeyJ0L6ZcWCX7nT4hZsi
# 1C0NJaJU6HQbfsPiMIGxgHYJCbQue/mVBE02MPhmN8fZlsTRKWT9Miu67S0PI5Ib
# ZWo88Ew1coucBm25K2NWdoR3dCP8EFnxqL556L8M4iDWYQ/djf8cpFAN9QJBFrNw
# CaXS+vxIFUjZ6TSjf8gOYPAONmAg5DsCucgyO4MBKnvlY5h2J+GTq/FC+kWzL9jE
# UfhqOWSP34ol2lg319zOtKg4Ga+GOivo2DmgWQhDwZ2rmRR+xgN8rkQjpJKIT5Zj
# Ji+ucJrghBZ0sN622QYG0u0Ap9Jy4KCOxcFfS1b4gNhmMDWg27Tx9tIguXmjOE3M
# aAs4wmm4Nz4kpsf1KkB11Q==
# =gZuf
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 05 Sep 2024 13:59:29 BST
# gpg: using RSA key E1A5C593CD419DE28E8315CF3C2525ED14360CDE
# gpg: issuer "peter.maydell@linaro.org"
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" [ultimate]
# gpg: aka "Peter Maydell <pmaydell@gmail.com>" [ultimate]
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" [ultimate]
# gpg: aka "Peter Maydell <peter@archaic.org.uk>" [ultimate]
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* tag 'pull-target-arm-20240905' of https://git.linaro.org/people/pmaydell/qemu-arm: (25 commits)
platform-bus: fix refcount leak
hw/arm/boot: Explain why load_elf_hdr() error is ignored
hw/arm/boot: Report error msg if loading elf/dtb failed
hw/arm/xilinx_zynq: Enable Security Extensions
target/arm: Correct names of VFP VFNMA and VFNMS insns
hw/arm/sbsa-ref: Don't leak string in sbsa_fdt_add_gic_node()
hm/nvram/xlnx-versal-efuse-ctrl: Call register_finalize_block
hw/misc/xlnx-versal-trng: Call register_finalize_block
hw/nvram/xlnx-zynqmp-efuse: Call register_finalize_block
hw/nvram/xlnx-bbram: Call register_finalize_block
hw/misc/xlnx-versal-trng: Free s->prng in finalize, not unrealize
hw/misc/xlnx-versal-cfu: destroy fifo in finalize
hw/arm/sbsa-ref: Use two-stage SMMU
hw/arm/virt: Default to two-stage SMMU from virt-9.2
hw/arm/smmuv3: Update comment documenting "stage" property
hw: add compat machines for 9.2
accel/tcg: Remove dead code from rr_cpu_thread_fn()
target/arm: Enable FEAT_EBF16 in the "max" CPU
target/arm: Implement FPCR.EBF=1 semantics for bfdotadd()
target/arm: Prepare bfdotadd() callers for FEAT_EBF support
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
42 files changed, 530 insertions, 155 deletions
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c index c59c77d..8ebadf8 100644 --- a/accel/tcg/tcg-accel-ops-rr.c +++ b/accel/tcg/tcg-accel-ops-rr.c @@ -302,9 +302,7 @@ static void *rr_cpu_thread_fn(void *arg) rr_deal_with_unplugged_cpus(); } - rcu_remove_force_rcu_notifier(&force_rcu); - rcu_unregister_thread(); - return NULL; + g_assert_not_reached(); } void rr_start_vcpu_thread(CPUState *cpu) diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst index 3ab6e72..35f52a5 100644 --- a/docs/system/arm/emulation.rst +++ b/docs/system/arm/emulation.rst @@ -45,6 +45,7 @@ the following architecture extensions: - FEAT_DotProd (Advanced SIMD dot product instructions) - FEAT_DoubleFault (Double Fault Extension) - FEAT_E0PD (Preventing EL0 access to halves of address maps) +- FEAT_EBF16 (AArch64 Extended BFloat16 instructions) - FEAT_ECV (Enhanced Counter Virtualization) - FEAT_EL0 (Support for execution at EL0) - FEAT_EL1 (Support for execution at EL1) diff --git a/hw/arm/boot.c b/hw/arm/boot.c index d480a7d..5301d8d 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -799,14 +799,18 @@ static ssize_t arm_load_elf(struct arm_boot_info *info, uint64_t *pentry, } elf_header; int data_swab = 0; bool big_endian; - ssize_t ret = -1; + ssize_t ret; Error *err = NULL; load_elf_hdr(info->kernel_filename, &elf_header, &elf_is64, &err); if (err) { + /* + * If the file is not an ELF file we silently return. + * The caller will fall back to try other formats. + */ error_free(err); - return ret; + return -1; } if (elf_is64) { @@ -839,6 +843,8 @@ static ssize_t arm_load_elf(struct arm_boot_info *info, uint64_t *pentry, 1, data_swab, as); if (ret <= 0) { /* The header loaded but the image didn't */ + error_report("Couldn't load elf '%s': %s", + info->kernel_filename, load_elf_strerror(ret)); exit(1); } diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c index ae37a92..e3195d5 100644 --- a/hw/arm/sbsa-ref.c +++ b/hw/arm/sbsa-ref.c @@ -164,23 +164,20 @@ static uint64_t sbsa_ref_cpu_mp_affinity(SBSAMachineState *sms, int idx) static void sbsa_fdt_add_gic_node(SBSAMachineState *sms) { - char *nodename; + const char *intc_nodename = "/intc"; + const char *its_nodename = "/intc/its"; - nodename = g_strdup_printf("/intc"); - qemu_fdt_add_subnode(sms->fdt, nodename); - qemu_fdt_setprop_sized_cells(sms->fdt, nodename, "reg", + qemu_fdt_add_subnode(sms->fdt, intc_nodename); + qemu_fdt_setprop_sized_cells(sms->fdt, intc_nodename, "reg", 2, sbsa_ref_memmap[SBSA_GIC_DIST].base, 2, sbsa_ref_memmap[SBSA_GIC_DIST].size, 2, sbsa_ref_memmap[SBSA_GIC_REDIST].base, 2, sbsa_ref_memmap[SBSA_GIC_REDIST].size); - nodename = g_strdup_printf("/intc/its"); - qemu_fdt_add_subnode(sms->fdt, nodename); - qemu_fdt_setprop_sized_cells(sms->fdt, nodename, "reg", + qemu_fdt_add_subnode(sms->fdt, its_nodename); + qemu_fdt_setprop_sized_cells(sms->fdt, its_nodename, "reg", 2, sbsa_ref_memmap[SBSA_GIC_ITS].base, 2, sbsa_ref_memmap[SBSA_GIC_ITS].size); - - g_free(nodename); } /* @@ -621,6 +618,7 @@ static void create_smmu(const SBSAMachineState *sms, PCIBus *bus) dev = qdev_new(TYPE_ARM_SMMUV3); + object_property_set_str(OBJECT(dev), "stage", "nested", &error_abort); object_property_set_link(OBJECT(dev), "primary-bus", OBJECT(bus), &error_abort); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 3971976..4c49b5a 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1981,6 +1981,7 @@ static Property smmuv3_properties[] = { * Stages of translation advertised. * "1": Stage 1 * "2": Stage 2 + * "nested": Both stage 1 and stage 2 * Defaults to stage 1 */ DEFINE_PROP_STRING("stage", SMMUv3State, stage), diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 687fe0b..7934b23 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1408,6 +1408,7 @@ static void create_pcie_irq_map(const MachineState *ms, static void create_smmu(const VirtMachineState *vms, PCIBus *bus) { + VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); char *node; const char compat[] = "arm,smmu-v3"; int irq = vms->irqmap[VIRT_SMMU]; @@ -1424,6 +1425,9 @@ static void create_smmu(const VirtMachineState *vms, dev = qdev_new(TYPE_ARM_SMMUV3); + if (!vmc->no_nested_smmu) { + object_property_set_str(OBJECT(dev), "stage", "nested", &error_fatal); + } object_property_set_link(OBJECT(dev), "primary-bus", OBJECT(bus), &error_abort); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); @@ -3301,10 +3305,21 @@ static void machvirt_machine_init(void) } type_init(machvirt_machine_init); +static void virt_machine_9_2_options(MachineClass *mc) +{ +} +DEFINE_VIRT_MACHINE_AS_LATEST(9, 2) + static void virt_machine_9_1_options(MachineClass *mc) { + VirtMachineClass *vmc = VIRT_MACHINE_CLASS(OBJECT_CLASS(mc)); + + virt_machine_9_2_options(mc); + compat_props_add(mc->compat_props, hw_compat_9_1, hw_compat_9_1_len); + /* 9.1 and earlier have only a stage-1 SMMU, not a nested s1+2 one */ + vmc->no_nested_smmu = true; } -DEFINE_VIRT_MACHINE_AS_LATEST(9, 1) +DEFINE_VIRT_MACHINE(9, 1) static void virt_machine_9_0_options(MachineClass *mc) { diff --git a/hw/arm/xilinx_zynq.c b/hw/arm/xilinx_zynq.c index 3c56b9a..37c234f 100644 --- a/hw/arm/xilinx_zynq.c +++ b/hw/arm/xilinx_zynq.c @@ -219,14 +219,6 @@ static void zynq_init(MachineState *machine) for (n = 0; n < smp_cpus; n++) { Object *cpuobj = object_new(machine->cpu_type); - /* - * By default A9 CPUs have EL3 enabled. This board does not currently - * support EL3 so the CPU EL3 property is disabled before realization. - */ - if (object_property_find(cpuobj, "has_el3")) { - object_property_set_bool(cpuobj, "has_el3", false, &error_fatal); - } - object_property_set_int(cpuobj, "midr", ZYNQ_BOARD_MIDR, &error_fatal); object_property_set_int(cpuobj, "reset-cbar", MPCORE_PERIPHBASE, diff --git a/hw/core/machine.c b/hw/core/machine.c index 27dcda0..adaba17 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -34,6 +34,9 @@ #include "hw/virtio/virtio-iommu.h" #include "audio/audio.h" +GlobalProperty hw_compat_9_1[] = {}; +const size_t hw_compat_9_1_len = G_N_ELEMENTS(hw_compat_9_1); + GlobalProperty hw_compat_9_0[] = { {"arm-cpu", "backcompat-cntfrq", "true" }, { "scsi-hd", "migrate-emulated-scsi-request", "false" }, diff --git a/hw/core/platform-bus.c b/hw/core/platform-bus.c index b8487b2..dc58bf5 100644 --- a/hw/core/platform-bus.c +++ b/hw/core/platform-bus.c @@ -145,9 +145,12 @@ static void platform_bus_map_mmio(PlatformBusDevice *pbus, SysBusDevice *sbdev, * the target device's memory region */ for (off = 0; off < pbus->mmio_size; off += alignment) { - if (!memory_region_find(&pbus->mmio, off, size).mr) { + MemoryRegion *mr = memory_region_find(&pbus->mmio, off, size).mr; + if (!mr) { found_region = true; break; + } else { + memory_region_unref(mr); } } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 7779c88..ba0ff51 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -79,6 +79,9 @@ { "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\ { "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, }, +GlobalProperty pc_compat_9_1[] = {}; +const size_t pc_compat_9_1_len = G_N_ELEMENTS(pc_compat_9_1); + GlobalProperty pc_compat_9_0[] = { { TYPE_X86_CPU, "x-amd-topoext-features-only", "false" }, { TYPE_X86_CPU, "x-l1-cache-per-thread", "false" }, diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 347afa4..2bf6865 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -474,13 +474,24 @@ static void pc_i440fx_machine_options(MachineClass *m) "Use a different south bridge than PIIX3"); } -static void pc_i440fx_machine_9_1_options(MachineClass *m) +static void pc_i440fx_machine_9_2_options(MachineClass *m) { pc_i440fx_machine_options(m); m->alias = "pc"; m->is_default = true; } +DEFINE_I440FX_MACHINE(9, 2); + +static void pc_i440fx_machine_9_1_options(MachineClass *m) +{ + pc_i440fx_machine_9_2_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_9_1, hw_compat_9_1_len); + compat_props_add(m->compat_props, pc_compat_9_1, pc_compat_9_1_len); +} + DEFINE_I440FX_MACHINE(9, 1); static void pc_i440fx_machine_9_0_options(MachineClass *m) @@ -488,8 +499,6 @@ static void pc_i440fx_machine_9_0_options(MachineClass *m) PCMachineClass *pcmc = PC_MACHINE_CLASS(m); pc_i440fx_machine_9_1_options(m); - m->alias = NULL; - m->is_default = false; m->smbios_memory_device_size = 16 * GiB; compat_props_add(m->compat_props, hw_compat_9_0, hw_compat_9_0_len); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index f2d8edf..8319b6d 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -356,19 +356,28 @@ static void pc_q35_machine_options(MachineClass *m) pc_q35_compat_defaults, pc_q35_compat_defaults_len); } -static void pc_q35_machine_9_1_options(MachineClass *m) +static void pc_q35_machine_9_2_options(MachineClass *m) { pc_q35_machine_options(m); m->alias = "q35"; } +DEFINE_Q35_MACHINE(9, 2); + +static void pc_q35_machine_9_1_options(MachineClass *m) +{ + pc_q35_machine_9_2_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_9_1, hw_compat_9_1_len); + compat_props_add(m->compat_props, pc_compat_9_1, pc_compat_9_1_len); +} + DEFINE_Q35_MACHINE(9, 1); static void pc_q35_machine_9_0_options(MachineClass *m) { PCMachineClass *pcmc = PC_MACHINE_CLASS(m); pc_q35_machine_9_1_options(m); - m->alias = NULL; m->smbios_memory_device_size = 16 * GiB; compat_props_add(m->compat_props, hw_compat_9_0, hw_compat_9_0_len); compat_props_add(m->compat_props, pc_compat_9_0, pc_compat_9_0_len); diff --git a/hw/m68k/virt.c b/hw/m68k/virt.c index cda199a..ea5c4a5 100644 --- a/hw/m68k/virt.c +++ b/hw/m68k/virt.c @@ -366,10 +366,17 @@ type_init(virt_machine_register_types) #define DEFINE_VIRT_MACHINE(major, minor) \ DEFINE_VIRT_MACHINE_IMPL(false, major, minor) +static void virt_machine_9_2_options(MachineClass *mc) +{ +} +DEFINE_VIRT_MACHINE_AS_LATEST(9, 2) + static void virt_machine_9_1_options(MachineClass *mc) { + virt_machine_9_2_options(mc); + compat_props_add(mc->compat_props, hw_compat_9_1, hw_compat_9_1_len); } -DEFINE_VIRT_MACHINE_AS_LATEST(9, 1) +DEFINE_VIRT_MACHINE(9, 1) static void virt_machine_9_0_options(MachineClass *mc) { diff --git a/hw/misc/xlnx-versal-cfu.c b/hw/misc/xlnx-versal-cfu.c index 6bb82e5..2284b40 100644 --- a/hw/misc/xlnx-versal-cfu.c +++ b/hw/misc/xlnx-versal-cfu.c @@ -397,6 +397,13 @@ static void cfu_fdro_init(Object *obj) fifo32_create(&s->fdro_data, 8 * KiB / sizeof(uint32_t)); } +static void cfu_fdro_finalize(Object *obj) +{ + XlnxVersalCFUFDRO *s = XLNX_VERSAL_CFU_FDRO(obj); + + fifo32_destroy(&s->fdro_data); +} + static void cfu_fdro_reset_enter(Object *obj, ResetType type) { XlnxVersalCFUFDRO *s = XLNX_VERSAL_CFU_FDRO(obj); @@ -539,6 +546,7 @@ static const TypeInfo cfu_fdro_info = { .instance_size = sizeof(XlnxVersalCFUFDRO), .class_init = cfu_fdro_class_init, .instance_init = cfu_fdro_init, + .instance_finalize = cfu_fdro_finalize, .interfaces = (InterfaceInfo[]) { { TYPE_XLNX_CFI_IF }, { } diff --git a/hw/misc/xlnx-versal-trng.c b/hw/misc/xlnx-versal-trng.c index 51eb760..8690547 100644 --- a/hw/misc/xlnx-versal-trng.c +++ b/hw/misc/xlnx-versal-trng.c @@ -608,9 +608,8 @@ static void trng_init(Object *obj) { XlnxVersalTRng *s = XLNX_VERSAL_TRNG(obj); SysBusDevice *sbd = SYS_BUS_DEVICE(obj); - RegisterInfoArray *reg_array; - reg_array = + s->reg_array = register_init_block32(DEVICE(obj), trng_regs_info, ARRAY_SIZE(trng_regs_info), s->regs_info, s->regs, @@ -618,16 +617,17 @@ static void trng_init(Object *obj) XLNX_VERSAL_TRNG_ERR_DEBUG, R_MAX * 4); - sysbus_init_mmio(sbd, ®_array->mem); + sysbus_init_mmio(sbd, &s->reg_array->mem); sysbus_init_irq(sbd, &s->irq); s->prng = g_rand_new(); } -static void trng_unrealize(DeviceState *dev) +static void trng_finalize(Object *obj) { - XlnxVersalTRng *s = XLNX_VERSAL_TRNG(dev); + XlnxVersalTRng *s = XLNX_VERSAL_TRNG(obj); + register_finalize_block(s->reg_array); g_rand_free(s->prng); s->prng = NULL; } @@ -689,7 +689,6 @@ static void trng_class_init(ObjectClass *klass, void *data) ResettableClass *rc = RESETTABLE_CLASS(klass); dc->vmsd = &vmstate_trng; - dc->unrealize = trng_unrealize; rc->phases.hold = trng_reset_hold; /* Clone uint64 property with set allowed after realized */ @@ -706,6 +705,7 @@ static const TypeInfo trng_info = { .instance_size = sizeof(XlnxVersalTRng), .class_init = trng_class_init, .instance_init = trng_init, + .instance_finalize = trng_finalize, }; static void trng_register_types(void) diff --git a/hw/nvram/xlnx-bbram.c b/hw/nvram/xlnx-bbram.c index 09575a7..1bc58e9 100644 --- a/hw/nvram/xlnx-bbram.c +++ b/hw/nvram/xlnx-bbram.c @@ -456,9 +456,8 @@ static void bbram_ctrl_init(Object *obj) { XlnxBBRam *s = XLNX_BBRAM(obj); SysBusDevice *sbd = SYS_BUS_DEVICE(obj); - RegisterInfoArray *reg_array; - reg_array = + s->reg_array = register_init_block32(DEVICE(obj), bbram_ctrl_regs_info, ARRAY_SIZE(bbram_ctrl_regs_info), s->regs_info, s->regs, @@ -466,10 +465,17 @@ static void bbram_ctrl_init(Object *obj) XLNX_BBRAM_ERR_DEBUG, R_MAX * 4); - sysbus_init_mmio(sbd, ®_array->mem); + sysbus_init_mmio(sbd, &s->reg_array->mem); sysbus_init_irq(sbd, &s->irq_bbram); } +static void bbram_ctrl_finalize(Object *obj) +{ + XlnxBBRam *s = XLNX_BBRAM(obj); + + register_finalize_block(s->reg_array); +} + static void bbram_prop_set_drive(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { @@ -537,6 +543,7 @@ static const TypeInfo bbram_ctrl_info = { .instance_size = sizeof(XlnxBBRam), .class_init = bbram_ctrl_class_init, .instance_init = bbram_ctrl_init, + .instance_finalize = bbram_ctrl_finalize, }; static void bbram_ctrl_register_types(void) diff --git a/hw/nvram/xlnx-versal-efuse-ctrl.c b/hw/nvram/xlnx-versal-efuse-ctrl.c index def6fe33..8252a5c 100644 --- a/hw/nvram/xlnx-versal-efuse-ctrl.c +++ b/hw/nvram/xlnx-versal-efuse-ctrl.c @@ -712,9 +712,8 @@ static void efuse_ctrl_init(Object *obj) { XlnxVersalEFuseCtrl *s = XLNX_VERSAL_EFUSE_CTRL(obj); SysBusDevice *sbd = SYS_BUS_DEVICE(obj); - RegisterInfoArray *reg_array; - reg_array = + s->reg_array = register_init_block32(DEVICE(obj), efuse_ctrl_regs_info, ARRAY_SIZE(efuse_ctrl_regs_info), s->regs_info, s->regs, @@ -722,7 +721,7 @@ static void efuse_ctrl_init(Object *obj) XLNX_VERSAL_EFUSE_CTRL_ERR_DEBUG, R_MAX * 4); - sysbus_init_mmio(sbd, ®_array->mem); + sysbus_init_mmio(sbd, &s->reg_array->mem); sysbus_init_irq(sbd, &s->irq_efuse_imr); } @@ -730,6 +729,7 @@ static void efuse_ctrl_finalize(Object *obj) { XlnxVersalEFuseCtrl *s = XLNX_VERSAL_EFUSE_CTRL(obj); + register_finalize_block(s->reg_array); g_free(s->extra_pg0_lock_spec); } diff --git a/hw/nvram/xlnx-zynqmp-efuse.c b/hw/nvram/xlnx-zynqmp-efuse.c index 2d465f0..4e2d1b9 100644 --- a/hw/nvram/xlnx-zynqmp-efuse.c +++ b/hw/nvram/xlnx-zynqmp-efuse.c @@ -803,9 +803,8 @@ static void zynqmp_efuse_init(Object *obj) { XlnxZynqMPEFuse *s = XLNX_ZYNQMP_EFUSE(obj); SysBusDevice *sbd = SYS_BUS_DEVICE(obj); - RegisterInfoArray *reg_array; - reg_array = + s->reg_array = register_init_block32(DEVICE(obj), zynqmp_efuse_regs_info, ARRAY_SIZE(zynqmp_efuse_regs_info), s->regs_info, s->regs, @@ -813,10 +812,17 @@ static void zynqmp_efuse_init(Object *obj) ZYNQMP_EFUSE_ERR_DEBUG, R_MAX * 4); - sysbus_init_mmio(sbd, ®_array->mem); + sysbus_init_mmio(sbd, &s->reg_array->mem); sysbus_init_irq(sbd, &s->irq); } +static void zynqmp_efuse_finalize(Object *obj) +{ + XlnxZynqMPEFuse *s = XLNX_ZYNQMP_EFUSE(obj); + + register_finalize_block(s->reg_array); +} + static const VMStateDescription vmstate_efuse = { .name = TYPE_XLNX_ZYNQMP_EFUSE, .version_id = 1, @@ -853,6 +859,7 @@ static const TypeInfo efuse_info = { .instance_size = sizeof(XlnxZynqMPEFuse), .class_init = zynqmp_efuse_class_init, .instance_init = zynqmp_efuse_init, + .instance_finalize = zynqmp_efuse_finalize, }; static void efuse_register_types(void) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 370d7c3..8aa3ce7 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -4838,14 +4838,25 @@ static void spapr_machine_latest_class_options(MachineClass *mc) DEFINE_SPAPR_MACHINE_IMPL(false, major, minor, _, tag) /* + * pseries-9.2 + */ +static void spapr_machine_9_2_class_options(MachineClass *mc) +{ + /* Defaults for the latest behaviour inherited from the base class */ +} + +DEFINE_SPAPR_MACHINE_AS_LATEST(9, 2); + +/* * pseries-9.1 */ static void spapr_machine_9_1_class_options(MachineClass *mc) { - /* Defaults for the latest behaviour inherited from the base class */ + spapr_machine_9_2_class_options(mc); + compat_props_add(mc->compat_props, hw_compat_9_1, hw_compat_9_1_len); } -DEFINE_SPAPR_MACHINE_AS_LATEST(9, 1); +DEFINE_SPAPR_MACHINE(9, 1); /* * pseries-9.0 diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index c483ff8..18240a0 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -871,14 +871,26 @@ static const TypeInfo ccw_machine_info = { DEFINE_CCW_MACHINE_IMPL(false, major, minor) +static void ccw_machine_9_2_instance_options(MachineState *machine) +{ +} + +static void ccw_machine_9_2_class_options(MachineClass *mc) +{ +} +DEFINE_CCW_MACHINE_AS_LATEST(9, 2); + static void ccw_machine_9_1_instance_options(MachineState *machine) { + ccw_machine_9_2_instance_options(machine); } static void ccw_machine_9_1_class_options(MachineClass *mc) { + ccw_machine_9_2_class_options(mc); + compat_props_add(mc->compat_props, hw_compat_9_1, hw_compat_9_1_len); } -DEFINE_CCW_MACHINE_AS_LATEST(9, 1); +DEFINE_CCW_MACHINE(9, 1); static void ccw_machine_9_0_instance_options(MachineState *machine) { diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index a4d937e..aca4f80 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -134,6 +134,7 @@ struct VirtMachineClass { bool no_cpu_topology; bool no_tcg_lpa2; bool no_ns_el2_virt_timer_irq; + bool no_nested_smmu; }; struct VirtMachineState { diff --git a/include/hw/boards.h b/include/hw/boards.h index 48ff6d8..9a49277 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -732,6 +732,9 @@ struct MachineState { } \ type_init(machine_initfn##_register_types) +extern GlobalProperty hw_compat_9_1[]; +extern const size_t hw_compat_9_1_len; + extern GlobalProperty hw_compat_9_0[]; extern const size_t hw_compat_9_0_len; diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 4e55d7e..14ee062 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -215,6 +215,9 @@ void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size); /* sgx.c */ void pc_machine_init_sgx_epc(PCMachineState *pcms); +extern GlobalProperty pc_compat_9_1[]; +extern const size_t pc_compat_9_1_len; + extern GlobalProperty pc_compat_9_0[]; extern const size_t pc_compat_9_0_len; diff --git a/include/hw/misc/xlnx-versal-trng.h b/include/hw/misc/xlnx-versal-trng.h index 0bcef8a..d96f8f9 100644 --- a/include/hw/misc/xlnx-versal-trng.h +++ b/include/hw/misc/xlnx-versal-trng.h @@ -50,6 +50,7 @@ typedef struct XlnxVersalTRng { uint64_t forced_prng_count; uint64_t tst_seed[2]; + RegisterInfoArray *reg_array; uint32_t regs[RMAX_XLNX_VERSAL_TRNG]; RegisterInfo regs_info[RMAX_XLNX_VERSAL_TRNG]; } XlnxVersalTRng; diff --git a/include/hw/nvram/xlnx-bbram.h b/include/hw/nvram/xlnx-bbram.h index 6fc13f8..bce8e89 100644 --- a/include/hw/nvram/xlnx-bbram.h +++ b/include/hw/nvram/xlnx-bbram.h @@ -47,6 +47,7 @@ struct XlnxBBRam { bool bbram8_wo; bool blk_ro; + RegisterInfoArray *reg_array; uint32_t regs[RMAX_XLNX_BBRAM]; RegisterInfo regs_info[RMAX_XLNX_BBRAM]; }; diff --git a/include/hw/nvram/xlnx-versal-efuse.h b/include/hw/nvram/xlnx-versal-efuse.h index 86e2261..afa4f4f 100644 --- a/include/hw/nvram/xlnx-versal-efuse.h +++ b/include/hw/nvram/xlnx-versal-efuse.h @@ -44,6 +44,7 @@ struct XlnxVersalEFuseCtrl { void *extra_pg0_lock_spec; /* Opaque property */ uint32_t extra_pg0_lock_n16; + RegisterInfoArray *reg_array; uint32_t regs[XLNX_VERSAL_EFUSE_CTRL_R_MAX]; RegisterInfo regs_info[XLNX_VERSAL_EFUSE_CTRL_R_MAX]; }; diff --git a/include/hw/nvram/xlnx-zynqmp-efuse.h b/include/hw/nvram/xlnx-zynqmp-efuse.h index f5beacc..7fb12df 100644 --- a/include/hw/nvram/xlnx-zynqmp-efuse.h +++ b/include/hw/nvram/xlnx-zynqmp-efuse.h @@ -37,6 +37,7 @@ struct XlnxZynqMPEFuse { qemu_irq irq; XlnxEFuse *efuse; + RegisterInfoArray *reg_array; uint32_t regs[XLNX_ZYNQMP_EFUSE_R_MAX]; RegisterInfo regs_info[XLNX_ZYNQMP_EFUSE_R_MAX]; }; diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h index c59ca10..cfb82c2 100644 --- a/target/arm/cpu-features.h +++ b/target/arm/cpu-features.h @@ -556,6 +556,11 @@ static inline bool isar_feature_aa64_bf16(const ARMISARegisters *id) return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, BF16) != 0; } +static inline bool isar_feature_aa64_ebf16(const ARMISARegisters *id) +{ + return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, BF16) > 1; +} + static inline bool isar_feature_aa64_rcpc_8_3(const ARMISARegisters *id) { return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, LRCPC) != 0; diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 9a3fd59..f065756 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -1707,6 +1707,7 @@ void vfp_set_fpscr(CPUARMState *env, uint32_t val); #define FPCR_OFE (1 << 10) /* Overflow exception trap enable */ #define FPCR_UFE (1 << 11) /* Underflow exception trap enable */ #define FPCR_IXE (1 << 12) /* Inexact exception trap enable */ +#define FPCR_EBF (1 << 13) /* Extended BFloat16 behaviors */ #define FPCR_IDE (1 << 15) /* Input Denormal exception trap enable */ #define FPCR_LEN_MASK (7 << 16) /* LEN, A-profile only */ #define FPCR_FZ16 (1 << 19) /* ARMv8.2+, FP16 flush-to-zero */ diff --git a/target/arm/helper.h b/target/arm/helper.h index 970d059..b463be3 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -1027,13 +1027,13 @@ DEF_HELPER_FLAGS_5(gvec_ummla_b, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(gvec_usmmla_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) -DEF_HELPER_FLAGS_5(gvec_bfdot, TCG_CALL_NO_RWG, - void, ptr, ptr, ptr, ptr, i32) -DEF_HELPER_FLAGS_5(gvec_bfdot_idx, TCG_CALL_NO_RWG, - void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(gvec_bfdot, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_6(gvec_bfdot_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) -DEF_HELPER_FLAGS_5(gvec_bfmmla, TCG_CALL_NO_RWG, - void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(gvec_bfmmla, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) DEF_HELPER_FLAGS_6(gvec_bfmlal, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, i32) diff --git a/target/arm/tcg/cpu64.c b/target/arm/tcg/cpu64.c index fe232eb..79258a7 100644 --- a/target/arm/tcg/cpu64.c +++ b/target/arm/tcg/cpu64.c @@ -1160,7 +1160,7 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64ISAR1, FRINTTS, 1); /* FEAT_FRINTTS */ t = FIELD_DP64(t, ID_AA64ISAR1, SB, 1); /* FEAT_SB */ t = FIELD_DP64(t, ID_AA64ISAR1, SPECRES, 1); /* FEAT_SPECRES */ - t = FIELD_DP64(t, ID_AA64ISAR1, BF16, 1); /* FEAT_BF16 */ + t = FIELD_DP64(t, ID_AA64ISAR1, BF16, 2); /* FEAT_BF16, FEAT_EBF16 */ t = FIELD_DP64(t, ID_AA64ISAR1, DGH, 1); /* FEAT_DGH */ t = FIELD_DP64(t, ID_AA64ISAR1, I8MM, 1); /* FEAT_I8MM */ cpu->isar.id_aa64isar1 = t; @@ -1244,7 +1244,7 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64ZFR0, SVEVER, 1); t = FIELD_DP64(t, ID_AA64ZFR0, AES, 2); /* FEAT_SVE_PMULL128 */ t = FIELD_DP64(t, ID_AA64ZFR0, BITPERM, 1); /* FEAT_SVE_BitPerm */ - t = FIELD_DP64(t, ID_AA64ZFR0, BFLOAT16, 1); /* FEAT_BF16 */ + t = FIELD_DP64(t, ID_AA64ZFR0, BFLOAT16, 2); /* FEAT_BF16, FEAT_EBF16 */ t = FIELD_DP64(t, ID_AA64ZFR0, SHA3, 1); /* FEAT_SVE_SHA3 */ t = FIELD_DP64(t, ID_AA64ZFR0, SM4, 1); /* FEAT_SVE_SM4 */ t = FIELD_DP64(t, ID_AA64ZFR0, I8MM, 1); /* FEAT_I8MM */ diff --git a/target/arm/tcg/helper-sme.h b/target/arm/tcg/helper-sme.h index d22bf9d..59ecaa1 100644 --- a/target/arm/tcg/helper-sme.h +++ b/target/arm/tcg/helper-sme.h @@ -126,8 +126,8 @@ DEF_HELPER_FLAGS_7(sme_fmopa_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_7(sme_fmopa_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, ptr, i32) -DEF_HELPER_FLAGS_6(sme_bfmopa, TCG_CALL_NO_RWG, - void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_7(sme_bfmopa, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, env, i32) DEF_HELPER_FLAGS_6(sme_smopa_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_6(sme_umopa_s, TCG_CALL_NO_RWG, diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c index 0210680..8cf1265 100644 --- a/target/arm/tcg/sme_helper.c +++ b/target/arm/tcg/sme_helper.c @@ -1079,38 +1079,68 @@ void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, } } -void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, uint32_t desc) +void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, + void *vpn, void *vpm, CPUARMState *env, uint32_t desc) { intptr_t row, col, oprsz = simd_maxsz(desc); uint32_t neg = simd_data(desc) * 0x80008000u; uint16_t *pn = vpn, *pm = vpm; + float_status fpst, fpst_odd; - for (row = 0; row < oprsz; ) { - uint16_t prow = pn[H2(row >> 4)]; - do { - void *vza_row = vza + tile_vslice_offset(row); - uint32_t n = *(uint32_t *)(vzn + H1_4(row)); + if (is_ebf(env, &fpst, &fpst_odd)) { + for (row = 0; row < oprsz; ) { + uint16_t prow = pn[H2(row >> 4)]; + do { + void *vza_row = vza + tile_vslice_offset(row); + uint32_t n = *(uint32_t *)(vzn + H1_4(row)); - n = f16mop_adj_pair(n, prow, neg); + n = f16mop_adj_pair(n, prow, neg); - for (col = 0; col < oprsz; ) { - uint16_t pcol = pm[H2(col >> 4)]; - do { - if (prow & pcol & 0b0101) { - uint32_t *a = vza_row + H1_4(col); - uint32_t m = *(uint32_t *)(vzm + H1_4(col)); + for (col = 0; col < oprsz; ) { + uint16_t pcol = pm[H2(col >> 4)]; + do { + if (prow & pcol & 0b0101) { + uint32_t *a = vza_row + H1_4(col); + uint32_t m = *(uint32_t *)(vzm + H1_4(col)); - m = f16mop_adj_pair(m, pcol, 0); - *a = bfdotadd(*a, n, m); - } - col += 4; - pcol >>= 4; - } while (col & 15); - } - row += 4; - prow >>= 4; - } while (row & 15); + m = f16mop_adj_pair(m, pcol, 0); + *a = bfdotadd_ebf(*a, n, m, &fpst, &fpst_odd); + } + col += 4; + pcol >>= 4; + } while (col & 15); + } + row += 4; + prow >>= 4; + } while (row & 15); + } + } else { + for (row = 0; row < oprsz; ) { + uint16_t prow = pn[H2(row >> 4)]; + do { + void *vza_row = vza + tile_vslice_offset(row); + uint32_t n = *(uint32_t *)(vzn + H1_4(row)); + + n = f16mop_adj_pair(n, prow, neg); + + for (col = 0; col < oprsz; ) { + uint16_t pcol = pm[H2(col >> 4)]; + do { + if (prow & pcol & 0b0101) { + uint32_t *a = vza_row + H1_4(col); + uint32_t m = *(uint32_t *)(vzm + H1_4(col)); + + m = f16mop_adj_pair(m, pcol, 0); + *a = bfdotadd(*a, n, m, &fpst); + } + col += 4; + pcol >>= 4; + } while (col & 15); + } + row += 4; + prow >>= 4; + } while (row & 15); + } } } diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index 4684e7e..6d5f12e 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -736,6 +736,22 @@ static void gen_gvec_op4_ool(DisasContext *s, bool is_q, int rd, int rn, } /* + * Expand a 4-operand operation using an out-of-line helper that takes + * a pointer to the CPU env. + */ +static void gen_gvec_op4_env(DisasContext *s, bool is_q, int rd, int rn, + int rm, int ra, int data, + gen_helper_gvec_4_ptr *fn) +{ + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), + tcg_env, + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); +} + +/* * Expand a 4-operand + fpstatus pointer + simd data value operation using * an out-of-line helper. */ @@ -5608,11 +5624,20 @@ static bool do_dot_vector(DisasContext *s, arg_qrrr_e *a, return true; } +static bool do_dot_vector_env(DisasContext *s, arg_qrrr_e *a, + gen_helper_gvec_4_ptr *fn) +{ + if (fp_access_check(s)) { + gen_gvec_op4_env(s, a->q, a->rd, a->rn, a->rm, a->rd, 0, fn); + } + return true; +} + TRANS_FEAT(SDOT_v, aa64_dp, do_dot_vector, a, gen_helper_gvec_sdot_b) TRANS_FEAT(UDOT_v, aa64_dp, do_dot_vector, a, gen_helper_gvec_udot_b) TRANS_FEAT(USDOT_v, aa64_i8mm, do_dot_vector, a, gen_helper_gvec_usdot_b) -TRANS_FEAT(BFDOT_v, aa64_bf16, do_dot_vector, a, gen_helper_gvec_bfdot) -TRANS_FEAT(BFMMLA, aa64_bf16, do_dot_vector, a, gen_helper_gvec_bfmmla) +TRANS_FEAT(BFDOT_v, aa64_bf16, do_dot_vector_env, a, gen_helper_gvec_bfdot) +TRANS_FEAT(BFMMLA, aa64_bf16, do_dot_vector_env, a, gen_helper_gvec_bfmmla) TRANS_FEAT(SMMLA, aa64_i8mm, do_dot_vector, a, gen_helper_gvec_smmla_b) TRANS_FEAT(UMMLA, aa64_i8mm, do_dot_vector, a, gen_helper_gvec_ummla_b) TRANS_FEAT(USMMLA, aa64_i8mm, do_dot_vector, a, gen_helper_gvec_usmmla_b) @@ -6385,13 +6410,22 @@ static bool do_dot_vector_idx(DisasContext *s, arg_qrrx_e *a, return true; } +static bool do_dot_vector_idx_env(DisasContext *s, arg_qrrx_e *a, + gen_helper_gvec_4_ptr *fn) +{ + if (fp_access_check(s)) { + gen_gvec_op4_env(s, a->q, a->rd, a->rn, a->rm, a->rd, a->idx, fn); + } + return true; +} + TRANS_FEAT(SDOT_vi, aa64_dp, do_dot_vector_idx, a, gen_helper_gvec_sdot_idx_b) TRANS_FEAT(UDOT_vi, aa64_dp, do_dot_vector_idx, a, gen_helper_gvec_udot_idx_b) TRANS_FEAT(SUDOT_vi, aa64_i8mm, do_dot_vector_idx, a, gen_helper_gvec_sudot_idx_b) TRANS_FEAT(USDOT_vi, aa64_i8mm, do_dot_vector_idx, a, gen_helper_gvec_usdot_idx_b) -TRANS_FEAT(BFDOT_vi, aa64_bf16, do_dot_vector_idx, a, +TRANS_FEAT(BFDOT_vi, aa64_bf16, do_dot_vector_idx_env, a, gen_helper_gvec_bfdot_idx) static bool trans_BFMLAL_vi(DisasContext *s, arg_qrrx_e *a) diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c index 915c9e5..13cd31a 100644 --- a/target/arm/tcg/translate-neon.c +++ b/target/arm/tcg/translate-neon.c @@ -148,6 +148,37 @@ static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm, return true; } +static bool do_neon_ddda_env(DisasContext *s, int q, int vd, int vn, int vm, + int data, gen_helper_gvec_4_ptr *fn_gvec) +{ + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) { + return false; + } + + /* + * UNDEF accesses to odd registers for each bit of Q. + * Q will be 0b111 for all Q-reg instructions, otherwise + * when we have mixed Q- and D-reg inputs. + */ + if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + int opr_sz = q ? 16 : 8; + tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd), + vfp_reg_offset(1, vn), + vfp_reg_offset(1, vm), + vfp_reg_offset(1, vd), + tcg_env, + opr_sz, opr_sz, data, fn_gvec); + return true; +} + static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm, int data, ARMFPStatusFlavour fp_flavour, gen_helper_gvec_4_ptr *fn_gvec_ptr) @@ -266,8 +297,8 @@ static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a) if (!dc_isar_feature(aa32_bf16, s)) { return false; } - return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_bfdot); + return do_neon_ddda_env(s, a->q * 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_bfdot); } static bool trans_VFML(DisasContext *s, arg_VFML *a) @@ -360,8 +391,8 @@ static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a) if (!dc_isar_feature(aa32_bf16, s)) { return false; } - return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, - gen_helper_gvec_bfdot_idx); + return do_neon_ddda_env(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_bfdot_idx); } static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a) @@ -3699,8 +3730,8 @@ static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a) if (!dc_isar_feature(aa32_bf16, s)) { return false; } - return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_bfmmla); + return do_neon_ddda_env(s, 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_bfmmla); } static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a) diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c index ae42dde..01ece57 100644 --- a/target/arm/tcg/translate-sme.c +++ b/target/arm/tcg/translate-sme.c @@ -362,8 +362,7 @@ TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, FPST_FPCR, gen_helper_sme_fmopa_d) -/* TODO: FEAT_EBF16 */ -TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa) +TRANS_FEAT(BFMOPA, aa64_sme, do_outprod_env, a, MO_32, gen_helper_sme_bfmopa) TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s) TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s) diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c index a72c262..9e2536d 100644 --- a/target/arm/tcg/translate-sve.c +++ b/target/arm/tcg/translate-sve.c @@ -252,6 +252,25 @@ static bool gen_gvec_fpst_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, return ret; } +static bool gen_gvec_env_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, + int rd, int rn, int rm, int ra, + int data) +{ + return gen_gvec_ptr_zzzz(s, fn, rd, rn, rm, ra, data, tcg_env); +} + +static bool gen_gvec_env_arg_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, + arg_rrrr_esz *a, int data) +{ + return gen_gvec_env_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, data); +} + +static bool gen_gvec_env_arg_zzxz(DisasContext *s, gen_helper_gvec_4_ptr *fn, + arg_rrxr_esz *a) +{ + return gen_gvec_env_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, a->index); +} + /* Invoke an out-of-line helper on 4 Zregs, 1 Preg, plus fpst. */ static bool gen_gvec_fpst_zzzzp(DisasContext *s, gen_helper_gvec_5_ptr *fn, int rd, int rn, int rm, int ra, int pg, @@ -7113,12 +7132,12 @@ TRANS_FEAT_NONSTREAMING(USMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, TRANS_FEAT_NONSTREAMING(UMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, gen_helper_gvec_ummla_b, a, 0) -TRANS_FEAT(BFDOT_zzzz, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, +TRANS_FEAT(BFDOT_zzzz, aa64_sve_bf16, gen_gvec_env_arg_zzzz, gen_helper_gvec_bfdot, a, 0) -TRANS_FEAT(BFDOT_zzxz, aa64_sve_bf16, gen_gvec_ool_arg_zzxz, +TRANS_FEAT(BFDOT_zzxz, aa64_sve_bf16, gen_gvec_env_arg_zzxz, gen_helper_gvec_bfdot_idx, a) -TRANS_FEAT_NONSTREAMING(BFMMLA, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, +TRANS_FEAT_NONSTREAMING(BFMMLA, aa64_sve_bf16, gen_gvec_env_arg_zzzz, gen_helper_gvec_bfmmla, a, 0) static bool do_BFMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel) diff --git a/target/arm/tcg/translate-vfp.c b/target/arm/tcg/translate-vfp.c index cd5b848..b6fa28a 100644 --- a/target/arm/tcg/translate-vfp.c +++ b/target/arm/tcg/translate-vfp.c @@ -2190,8 +2190,8 @@ static bool do_vfm_sp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d) static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d) { /* - * VFNMA : fd = muladd(-fd, fn, fm) - * VFNMS : fd = muladd(-fd, -fn, fm) + * VFNMA : fd = muladd(-fd, -fn, fm) + * VFNMS : fd = muladd(-fd, fn, fm) * VFMA : fd = muladd( fd, fn, fm) * VFMS : fd = muladd( fd, -fn, fm) * @@ -2262,8 +2262,8 @@ static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d) #define MAKE_VFM_TRANS_FNS(PREC) \ MAKE_ONE_VFM_TRANS_FN(VFMA, PREC, false, false) \ MAKE_ONE_VFM_TRANS_FN(VFMS, PREC, true, false) \ - MAKE_ONE_VFM_TRANS_FN(VFNMA, PREC, false, true) \ - MAKE_ONE_VFM_TRANS_FN(VFNMS, PREC, true, true) + MAKE_ONE_VFM_TRANS_FN(VFNMS, PREC, false, true) \ + MAKE_ONE_VFM_TRANS_FN(VFNMA, PREC, true, true) MAKE_VFM_TRANS_FNS(hp) MAKE_VFM_TRANS_FNS(sp) diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 98604d1..22ddb96 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -2790,44 +2790,115 @@ DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) * BFloat16 Dot Product */ -float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) +bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) { - /* FPCR is ignored for BFDOT and BFMMLA. */ - float_status bf_status = { + /* + * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. + * For EBF = 0, we ignore the FPCR bits which determine rounding + * mode and denormal-flushing, and we do unfused multiplies and + * additions with intermediate rounding of all products and sums. + * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, + * and we perform a fused two-way sum-of-products without intermediate + * rounding of the products. + * In either case, we don't set fp exception flags. + * + * EBF is AArch64 only, so even if it's set in the FPCR it has + * no effect on AArch32 instructions. + */ + bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; + *statusp = (float_status){ .tininess_before_rounding = float_tininess_before_rounding, .float_rounding_mode = float_round_to_odd_inf, .flush_to_zero = true, .flush_inputs_to_zero = true, .default_nan_mode = true, }; + + if (ebf) { + float_status *fpst = &env->vfp.fp_status; + set_flush_to_zero(get_flush_to_zero(fpst), statusp); + set_flush_inputs_to_zero(get_flush_inputs_to_zero(fpst), statusp); + set_float_rounding_mode(get_float_rounding_mode(fpst), statusp); + + /* EBF=1 needs to do a step with round-to-odd semantics */ + *oddstatusp = *statusp; + set_float_rounding_mode(float_round_to_odd, oddstatusp); + } + + return ebf; +} + +float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) +{ float32 t1, t2; /* * Extract each BFloat16 from the element pair, and shift * them such that they become float32. */ - t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); - t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); - t1 = float32_add(t1, t2, &bf_status); - t1 = float32_add(sum, t1, &bf_status); + t1 = float32_mul(e1 << 16, e2 << 16, fpst); + t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); + t1 = float32_add(t1, t2, fpst); + t1 = float32_add(sum, t1, fpst); return t1; } -void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) +float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, + float_status *fpst, float_status *fpst_odd) +{ + /* + * Compare f16_dotadd() in sme_helper.c, but here we have + * bfloat16 inputs. In particular that means that we do not + * want the FPCR.FZ16 flush semantics, so we use the normal + * float_status for the input handling here. + */ + float64 e1r = float32_to_float64(e1 << 16, fpst); + float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); + float64 e2r = float32_to_float64(e2 << 16, fpst); + float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); + float64 t64; + float32 t32; + + /* + * The ARM pseudocode function FPDot performs both multiplies + * and the add with a single rounding operation. Emulate this + * by performing the first multiply in round-to-odd, then doing + * the second multiply as fused multiply-add, and rounding to + * float32 all in one step. + */ + t64 = float64_mul(e1r, e2r, fpst_odd); + t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); + + /* This conversion is exact, because we've already rounded. */ + t32 = float64_to_float32(t64, fpst); + + /* The final accumulation step is not fused. */ + return float32_add(sum, t32, fpst); +} + +void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, + CPUARMState *env, uint32_t desc) { intptr_t i, opr_sz = simd_oprsz(desc); float32 *d = vd, *a = va; uint32_t *n = vn, *m = vm; + float_status fpst, fpst_odd; - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = bfdotadd(a[i], n[i], m[i]); + if (is_ebf(env, &fpst, &fpst_odd)) { + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); + } + } else { + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = bfdotadd(a[i], n[i], m[i], &fpst); + } } clear_tail(d, opr_sz, simd_maxsz(desc)); } void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) + void *va, CPUARMState *env, uint32_t desc) { intptr_t i, j, opr_sz = simd_oprsz(desc); intptr_t index = simd_data(desc); @@ -2835,53 +2906,100 @@ void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, intptr_t eltspersegment = MIN(16 / 4, elements); float32 *d = vd, *a = va; uint32_t *n = vn, *m = vm; + float_status fpst, fpst_odd; - for (i = 0; i < elements; i += eltspersegment) { - uint32_t m_idx = m[i + H4(index)]; + if (is_ebf(env, &fpst, &fpst_odd)) { + for (i = 0; i < elements; i += eltspersegment) { + uint32_t m_idx = m[i + H4(index)]; - for (j = i; j < i + eltspersegment; j++) { - d[j] = bfdotadd(a[j], n[j], m_idx); + for (j = i; j < i + eltspersegment; j++) { + d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); + } + } + } else { + for (i = 0; i < elements; i += eltspersegment) { + uint32_t m_idx = m[i + H4(index)]; + + for (j = i; j < i + eltspersegment; j++) { + d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); + } } } clear_tail(d, opr_sz, simd_maxsz(desc)); } -void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) +void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, + CPUARMState *env, uint32_t desc) { intptr_t s, opr_sz = simd_oprsz(desc); float32 *d = vd, *a = va; uint32_t *n = vn, *m = vm; + float_status fpst, fpst_odd; - for (s = 0; s < opr_sz / 4; s += 4) { - float32 sum00, sum01, sum10, sum11; + if (is_ebf(env, &fpst, &fpst_odd)) { + for (s = 0; s < opr_sz / 4; s += 4) { + float32 sum00, sum01, sum10, sum11; - /* - * Process the entire segment at once, writing back the - * results only after we've consumed all of the inputs. - * - * Key to indices by column: - * i j i k j k - */ - sum00 = a[s + H4(0 + 0)]; - sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); - sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); - - sum01 = a[s + H4(0 + 1)]; - sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); - sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); - - sum10 = a[s + H4(2 + 0)]; - sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); - sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); - - sum11 = a[s + H4(2 + 1)]; - sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); - sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); - - d[s + H4(0 + 0)] = sum00; - d[s + H4(0 + 1)] = sum01; - d[s + H4(2 + 0)] = sum10; - d[s + H4(2 + 1)] = sum11; + /* + * Process the entire segment at once, writing back the + * results only after we've consumed all of the inputs. + * + * Key to indices by column: + * i j i k j k + */ + sum00 = a[s + H4(0 + 0)]; + sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); + sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); + + sum01 = a[s + H4(0 + 1)]; + sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); + sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); + + sum10 = a[s + H4(2 + 0)]; + sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); + sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); + + sum11 = a[s + H4(2 + 1)]; + sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); + sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); + + d[s + H4(0 + 0)] = sum00; + d[s + H4(0 + 1)] = sum01; + d[s + H4(2 + 0)] = sum10; + d[s + H4(2 + 1)] = sum11; + } + } else { + for (s = 0; s < opr_sz / 4; s += 4) { + float32 sum00, sum01, sum10, sum11; + + /* + * Process the entire segment at once, writing back the + * results only after we've consumed all of the inputs. + * + * Key to indices by column: + * i j i k j k + */ + sum00 = a[s + H4(0 + 0)]; + sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); + sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); + + sum01 = a[s + H4(0 + 1)]; + sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); + sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); + + sum10 = a[s + H4(2 + 0)]; + sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); + sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); + + sum11 = a[s + H4(2 + 1)]; + sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); + sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); + + d[s + H4(0 + 0)] = sum00; + d[s + H4(0 + 1)] = sum01; + d[s + H4(2 + 0)] = sum10; + d[s + H4(2 + 1)] = sum11; + } } clear_tail(d, opr_sz, simd_maxsz(desc)); } diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h index 3ca1b94..094f5c1 100644 --- a/target/arm/tcg/vec_internal.h +++ b/target/arm/tcg/vec_internal.h @@ -223,13 +223,46 @@ int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); * bfdotadd: * @sum: addend * @e1, @e2: multiplicand vectors + * @fpst: floating-point status to use * * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum. * The @e1 and @e2 operands correspond to the 32-bit source vector * slots and contain two Bfloat16 values each. * - * Corresponds to the ARM pseudocode function BFDotAdd. + * Corresponds to the ARM pseudocode function BFDotAdd, specialized + * for the FPCR.EBF == 0 case. */ -float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2); +float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst); +/** + * bfdotadd_ebf: + * @sum: addend + * @e1, @e2: multiplicand vectors + * @fpst: floating-point status to use + * @fpst_odd: floating-point status to use for round-to-odd operations + * + * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum. + * The @e1 and @e2 operands correspond to the 32-bit source vector + * slots and contain two Bfloat16 values each. + * + * Corresponds to the ARM pseudocode function BFDotAdd, specialized + * for the FPCR.EBF == 1 case. + */ +float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, + float_status *fpst, float_status *fpst_odd); + +/** + * is_ebf: + * @env: CPU state + * @statusp: pointer to floating point status to fill in + * @oddstatusp: pointer to floating point status to fill in for round-to-odd + * + * Determine whether a BFDotAdd operation should use FPCR.EBF = 0 + * or FPCR.EBF = 1 semantics. On return, has initialized *statusp + * and *oddstatusp to suitable float_status arguments to use with either + * bfdotadd() or bfdotadd_ebf(). + * Returns true for EBF = 1, false for EBF = 0. (The caller should use this + * to decide whether to call bfdotadd() or bfdotadd_ebf().) + */ +bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp); #endif /* TARGET_ARM_VEC_INTERNAL_H */ diff --git a/target/arm/tcg/vfp.decode b/target/arm/tcg/vfp.decode index 5405e80..2dd87a2 100644 --- a/target/arm/tcg/vfp.decode +++ b/target/arm/tcg/vfp.decode @@ -141,18 +141,18 @@ VDIV_dp ---- 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d VFMA_hp ---- 1110 1.10 .... .... 1001 .0. 0 .... @vfp_dnm_s VFMS_hp ---- 1110 1.10 .... .... 1001 .1. 0 .... @vfp_dnm_s -VFNMA_hp ---- 1110 1.01 .... .... 1001 .0. 0 .... @vfp_dnm_s -VFNMS_hp ---- 1110 1.01 .... .... 1001 .1. 0 .... @vfp_dnm_s +VFNMS_hp ---- 1110 1.01 .... .... 1001 .0. 0 .... @vfp_dnm_s +VFNMA_hp ---- 1110 1.01 .... .... 1001 .1. 0 .... @vfp_dnm_s VFMA_sp ---- 1110 1.10 .... .... 1010 .0. 0 .... @vfp_dnm_s VFMS_sp ---- 1110 1.10 .... .... 1010 .1. 0 .... @vfp_dnm_s -VFNMA_sp ---- 1110 1.01 .... .... 1010 .0. 0 .... @vfp_dnm_s -VFNMS_sp ---- 1110 1.01 .... .... 1010 .1. 0 .... @vfp_dnm_s +VFNMS_sp ---- 1110 1.01 .... .... 1010 .0. 0 .... @vfp_dnm_s +VFNMA_sp ---- 1110 1.01 .... .... 1010 .1. 0 .... @vfp_dnm_s VFMA_dp ---- 1110 1.10 .... .... 1011 .0.0 .... @vfp_dnm_d VFMS_dp ---- 1110 1.10 .... .... 1011 .1.0 .... @vfp_dnm_d -VFNMA_dp ---- 1110 1.01 .... .... 1011 .0.0 .... @vfp_dnm_d -VFNMS_dp ---- 1110 1.01 .... .... 1011 .1.0 .... @vfp_dnm_d +VFNMS_dp ---- 1110 1.01 .... .... 1011 .0.0 .... @vfp_dnm_d +VFNMA_dp ---- 1110 1.01 .... .... 1011 .1.0 .... @vfp_dnm_d VMOV_imm_hp ---- 1110 1.11 .... .... 1001 0000 .... \ vd=%vd_sp imm=%vmov_imm diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c index b3698da..203d373 100644 --- a/target/arm/vfp_helper.c +++ b/target/arm/vfp_helper.c @@ -254,6 +254,10 @@ static void vfp_set_fpcr_masked(CPUARMState *env, uint32_t val, uint32_t mask) val &= ~FPCR_FZ16; } + if (!cpu_isar_feature(aa64_ebf16, cpu)) { + val &= ~FPCR_EBF; + } + vfp_set_fpcr_to_host(env, val, mask); if (mask & (FPCR_LEN_MASK | FPCR_STRIDE_MASK)) { @@ -278,12 +282,12 @@ static void vfp_set_fpcr_masked(CPUARMState *env, uint32_t val, uint32_t mask) * We don't implement trapped exception handling, so the * trap enable bits, IDE|IXE|UFE|OFE|DZE|IOE are all RAZ/WI (not RES0!) * - * The FPCR bits we keep in vfp.fpcr are AHP, DN, FZ, RMode + * The FPCR bits we keep in vfp.fpcr are AHP, DN, FZ, RMode, EBF * and FZ16. Len, Stride and LTPSIZE we just handled. Store those bits * there, and zero any of the other FPCR bits and the RES0 and RAZ/WI * bits. */ - val &= FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE_MASK | FPCR_FZ16; + val &= FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE_MASK | FPCR_FZ16 | FPCR_EBF; env->vfp.fpcr &= ~mask; env->vfp.fpcr |= val; } |