diff options
112 files changed, 4699 insertions, 751 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 0e9f338..6c27914 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1360,6 +1360,18 @@ F: hw/pci-host/mv64361.c F: hw/pci-host/mv643xx.h F: include/hw/pci-host/mv64361.h +Virtual Open Firmware (VOF) +M: Alexey Kardashevskiy <aik@ozlabs.ru> +R: David Gibson <david@gibson.dropbear.id.au> +R: Greg Kurz <groug@kaod.org> +L: qemu-ppc@nongnu.org +S: Maintained +F: hw/ppc/spapr_vof* +F: hw/ppc/vof* +F: include/hw/ppc/vof* +F: pc-bios/vof/* +F: pc-bios/vof* + RISC-V Machines --------------- OpenTitan @@ -1952,6 +1964,15 @@ F: include/sysemu/rng*.h F: backends/rng*.c F: tests/qtest/virtio-rng-test.c +vhost-user-rng +M: Mathieu Poirier <mathieu.poirier@linaro.org> +S: Supported +F: docs/tools/vhost-user-rng.rst +F: hw/virtio/vhost-user-rng.c +F: hw/virtio/vhost-user-rng-pci.c +F: include/hw/virtio/vhost-user-rng.h +F: tools/vhost-user-rng/* + virtio-crypto M: Gonglei <arei.gonglei@huawei.com> S: Supported diff --git a/block/file-posix.c b/block/file-posix.c index a26eab0..cb9bffe 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -46,6 +46,7 @@ #if defined(HAVE_HOST_BLOCK_DEVICE) #include <paths.h> #include <sys/param.h> +#include <sys/mount.h> #include <IOKit/IOKitLib.h> #include <IOKit/IOBSD.h> #include <IOKit/storage/IOMediaBSDClient.h> @@ -1254,6 +1255,15 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) return; } +#if defined(__APPLE__) && (__MACH__) + struct statfs buf; + + if (!fstatfs(s->fd, &buf)) { + bs->bl.opt_transfer = buf.f_iosize; + bs->bl.pdiscard_alignment = buf.f_bsize; + } +#endif + if (bs->sg || S_ISBLK(st.st_mode)) { int ret = hdev_get_max_hw_transfer(s->fd, &st); @@ -1591,6 +1601,7 @@ out: } } +#if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD) static int translate_err(int err) { if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || @@ -1599,6 +1610,7 @@ static int translate_err(int err) } return err; } +#endif #ifdef CONFIG_FALLOCATE static int 
do_fallocate(int fd, int mode, off_t offset, off_t len) @@ -1811,16 +1823,27 @@ static int handle_aiocb_discard(void *opaque) } } while (errno == EINTR); - ret = -errno; + ret = translate_err(-errno); #endif } else { #ifdef CONFIG_FALLOCATE_PUNCH_HOLE ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, aiocb->aio_offset, aiocb->aio_nbytes); + ret = translate_err(-errno); +#elif defined(__APPLE__) && (__MACH__) + fpunchhole_t fpunchhole; + fpunchhole.fp_flags = 0; + fpunchhole.reserved = 0; + fpunchhole.fp_offset = aiocb->aio_offset; + fpunchhole.fp_length = aiocb->aio_nbytes; + if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) { + ret = errno == ENODEV ? -ENOTSUP : -errno; + } else { + ret = 0; + } #endif } - ret = translate_err(ret); if (ret == -ENOTSUP) { s->has_discard = false; } @@ -125,6 +125,8 @@ void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll) static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) { + dst->pdiscard_alignment = MAX(dst->pdiscard_alignment, + src->pdiscard_alignment); dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer); dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer, diff --git a/default-configs/devices/ppc-softmmu.mak b/default-configs/devices/ppc-softmmu.mak index c2d4119..4535993 100644 --- a/default-configs/devices/ppc-softmmu.mak +++ b/default-configs/devices/ppc-softmmu.mak @@ -14,7 +14,7 @@ CONFIG_SAM460EX=y CONFIG_MAC_OLDWORLD=y CONFIG_MAC_NEWWORLD=y -CONFIG_PEGASOS2=n +CONFIG_PEGASOS2=y # For PReP CONFIG_PREP=y diff --git a/docs/pcie_pci_bridge.txt b/docs/pcie_pci_bridge.txt index ab35ebf..1aa08fc 100644 --- a/docs/pcie_pci_bridge.txt +++ b/docs/pcie_pci_bridge.txt @@ -70,9 +70,9 @@ A detailed command line would be: [qemu-bin + storage options] \ -m 2G \ --device pcie-root-port,bus=pcie.0,id=rp1 \ --device pcie-root-port,bus=pcie.0,id=rp2 \ --device pcie-root-port,bus=pcie.0,id=rp3,bus-reserve=1 
\ +-device pcie-root-port,bus=pcie.0,id=rp1,slot=1 \ +-device pcie-root-port,bus=pcie.0,id=rp2,slot=2 \ +-device pcie-root-port,bus=pcie.0,id=rp3,slot=3,bus-reserve=1 \ -device pcie-pci-bridge,id=br1,bus=rp1 \ -device pcie-pci-bridge,id=br2,bus=rp2 \ -device e1000,bus=br1,addr=8 diff --git a/docs/system/deprecated.rst b/docs/system/deprecated.rst index 25d6c4c..6d438f1 100644 --- a/docs/system/deprecated.rst +++ b/docs/system/deprecated.rst @@ -221,6 +221,24 @@ This machine is deprecated because we have enough AST2500 based OpenPOWER machines. It can be easily replaced by the ``witherspoon-bmc`` or the ``romulus-bmc`` machines. +Backend options +--------------- + +Using non-persistent backing file with pmem=on (since 6.1) +'''''''''''''''''''''''''''''''''''''''''''''''''''''''''' + +This option is used when ``memory-backend-file`` is consumed by emulated NVDIMM +device. However enabling ``memory-backend-file.pmem`` option, when backing file +is (a) not DAX capable or (b) not on a filesystem that supports direct mapping +of persistent memory, is not safe and may lead to data loss or corruption in case +of host crash. +Options are: + + - modify VM configuration to set ``pmem=off`` to continue using fake NVDIMM + (without persistence guarantees) with backing file on non DAX storage + - move backing file to NVDIMM storage and keep ``pmem=on`` + (to have NVDIMM with persistence guarantees). 
+ Device options -------------- diff --git a/docs/system/ppc/ppce500.rst b/docs/system/ppc/ppce500.rst index 7a815c1..afc58f6 100644 --- a/docs/system/ppc/ppce500.rst +++ b/docs/system/ppc/ppce500.rst @@ -19,6 +19,7 @@ The ``ppce500`` machine supports the following devices: * Power-off functionality via one GPIO pin * 1 Freescale MPC8xxx PCI host controller * VirtIO devices via PCI bus +* 1 Freescale Enhanced Triple Speed Ethernet controller (eTSEC) Hardware configuration information ---------------------------------- @@ -121,7 +122,7 @@ To boot the 32-bit Linux kernel: Running U-Boot -------------- -U-Boot mainline v2021.04 release is tested at the time of writing. To build a +U-Boot mainline v2021.07 release is tested at the time of writing. To build a U-Boot mainline bootloader that can be booted by the ``ppce500`` machine, use the qemu-ppce500_defconfig with similar commands as described above for Linux: @@ -154,3 +155,10 @@ interface at PCI address 0.1.0, but we can switch that to an e1000 NIC by: -display none -serial stdio \ -bios u-boot \ -nic tap,ifname=tap0,script=no,downscript=no,model=e1000 + +The QEMU ``ppce500`` machine can also dynamically instantiate an eTSEC device +if “-device eTSEC” is given to QEMU: + +.. 
code-block:: bash + + -netdev tap,ifname=tap0,script=no,downscript=no,id=net0 -device eTSEC,netdev=net0 diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 39c8257..e28457a 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -207,7 +207,7 @@ static void ged_regs_write(void *opaque, hwaddr addr, uint64_t data, return; case ACPI_GED_REG_RESET: if (data == ACPI_GED_RESET_VALUE) { - qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); + qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); } return; } diff --git a/hw/block/block.c b/hw/block/block.c index 1e34573..d47ebf0 100644 --- a/hw/block/block.c +++ b/hw/block/block.c @@ -65,24 +65,58 @@ bool blkconf_blocksizes(BlockConf *conf, Error **errp) { BlockBackend *blk = conf->blk; BlockSizes blocksizes; - int backend_ret; + BlockDriverState *bs; + bool use_blocksizes; + bool use_bs; + + switch (conf->backend_defaults) { + case ON_OFF_AUTO_AUTO: + use_blocksizes = !blk_probe_blocksizes(blk, &blocksizes); + use_bs = false; + break; + + case ON_OFF_AUTO_ON: + use_blocksizes = !blk_probe_blocksizes(blk, &blocksizes); + bs = blk_bs(blk); + use_bs = bs; + break; + + case ON_OFF_AUTO_OFF: + use_blocksizes = false; + use_bs = false; + break; + + default: + abort(); + } - backend_ret = blk_probe_blocksizes(blk, &blocksizes); /* fill in detected values if they are not defined via qemu command line */ if (!conf->physical_block_size) { - if (!backend_ret) { + if (use_blocksizes) { conf->physical_block_size = blocksizes.phys; } else { conf->physical_block_size = BDRV_SECTOR_SIZE; } } if (!conf->logical_block_size) { - if (!backend_ret) { + if (use_blocksizes) { conf->logical_block_size = blocksizes.log; } else { conf->logical_block_size = BDRV_SECTOR_SIZE; } } + if (use_bs) { + if (!conf->opt_io_size) { + conf->opt_io_size = bs->bl.opt_transfer; + } + if (conf->discard_granularity == -1) { + if (bs->bl.pdiscard_alignment) { + conf->discard_granularity = 
bs->bl.pdiscard_alignment; + } else if (bs->bl.request_alignment != 1) { + conf->discard_granularity = bs->bl.request_alignment; + } + } + } if (conf->logical_block_size > conf->physical_block_size) { error_setg(errp, diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c index cd81893..252c3a7 100644 --- a/hw/block/dataplane/virtio-blk.c +++ b/hw/block/dataplane/virtio-blk.c @@ -198,6 +198,10 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) goto fail_guest_notifiers; } + /* + * Batch all the host notifiers in a single transaction to avoid + * quadratic time complexity in address_space_update_ioeventfds(). + */ memory_region_transaction_begin(); /* Set up virtqueue notify */ @@ -211,6 +215,10 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false); } + /* + * The transaction expects the ioeventfds to be open when it + * commits. Do it now, before the cleanup loop. + */ memory_region_transaction_commit(); while (j--) { @@ -330,12 +338,20 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev) aio_context_release(s->ctx); + /* + * Batch all the host notifiers in a single transaction to avoid + * quadratic time complexity in address_space_update_ioeventfds(). + */ memory_region_transaction_begin(); for (i = 0; i < nvqs; i++) { virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false); } + /* + * The transaction expects the ioeventfds to be open when it + * commits. Do it now, before the cleanup loop. 
+ */ memory_region_transaction_commit(); for (i = 0; i < nvqs; i++) { diff --git a/hw/hyperv/vmbus.c b/hw/hyperv/vmbus.c index 984caf8..c9887d5 100644 --- a/hw/hyperv/vmbus.c +++ b/hw/hyperv/vmbus.c @@ -2372,6 +2372,14 @@ static void vmbus_dev_realize(DeviceState *dev, Error **errp) assert(!qemu_uuid_is_null(&vdev->instanceid)); + if (!qemu_uuid_is_null(&vdc->instanceid)) { + /* Class wants to only have a single instance with a fixed UUID */ + if (!qemu_uuid_is_equal(&vdev->instanceid, &vdc->instanceid)) { + error_setg(&err, "instance id can't be changed"); + goto error_out; + } + } + /* Check for instance id collision for this class id */ QTAILQ_FOREACH(child, &BUS(vmbus)->children, sibling) { VMBusDevice *child_dev = VMBUS_DEVICE(child->child); @@ -2438,18 +2446,22 @@ static void vmbus_dev_unrealize(DeviceState *dev) free_channels(vdev); } +static Property vmbus_dev_props[] = { + DEFINE_PROP_UUID("instanceid", VMBusDevice, instanceid), + DEFINE_PROP_END_OF_LIST() +}; + + static void vmbus_dev_class_init(ObjectClass *klass, void *data) { DeviceClass *kdev = DEVICE_CLASS(klass); + device_class_set_props(kdev, vmbus_dev_props); kdev->bus_type = TYPE_VMBUS; kdev->realize = vmbus_dev_realize; kdev->unrealize = vmbus_dev_unrealize; kdev->reset = vmbus_dev_reset; } -static Property vmbus_dev_instanceid = - DEFINE_PROP_UUID("instanceid", VMBusDevice, instanceid); - static void vmbus_dev_instance_init(Object *obj) { VMBusDevice *vdev = VMBUS_DEVICE(obj); @@ -2458,8 +2470,6 @@ static void vmbus_dev_instance_init(Object *obj) if (!qemu_uuid_is_null(&vdc->instanceid)) { /* Class wants to only have a single instance with a fixed UUID */ vdev->instanceid = vdc->instanceid; - } else { - qdev_property_add_static(DEVICE(vdev), &vmbus_dev_instanceid); } } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 796ffc6..357437f 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -435,11 +435,15 @@ static void build_append_pci_bus_devices(Aml *parent_scope, PCIBus 
*bus, aml_append(dev, aml_name_decl("_ADR", aml_int(slot << 16))); if (bsel) { - aml_append(dev, aml_name_decl("_SUN", aml_int(slot))); + /* + * Can't declare _SUN here for every device as it changes 'slot' + * enumeration order in linux kernel, so use another variable for it + */ + aml_append(dev, aml_name_decl("ASUN", aml_int(slot))); method = aml_method("_DSM", 4, AML_SERIALIZED); aml_append(method, aml_return( aml_call6("PDSM", aml_arg(0), aml_arg(1), aml_arg(2), - aml_arg(3), aml_name("BSEL"), aml_name("_SUN")) + aml_arg(3), aml_name("BSEL"), aml_name("ASUN")) )); aml_append(dev, method); } @@ -466,6 +470,7 @@ static void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus, aml_append(method, aml_return(aml_int(s3d))); aml_append(dev, method); } else if (hotplug_enabled_dev) { + aml_append(dev, aml_name_decl("_SUN", aml_int(slot))); /* add _EJ0 to make slot hotpluggable */ method = aml_method("_EJ0", 1, AML_NOTSERIALIZED); aml_append(method, diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index bd7958b..16d20cd 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -3234,6 +3234,7 @@ static bool failover_replug_primary(VirtIONet *n, DeviceState *dev, } hotplug_handler_plug(hotplug_ctrl, dev, &err); } + pdev->partially_hotplugged = false; out: error_propagate(errp, err); diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index 2eb729d..0f37cf0 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -29,6 +29,7 @@ */ #include "qemu/osdep.h" +#include "qemu/log.h" #include "hw/i386/pc.h" #include "hw/pci-host/q35.h" #include "hw/qdev-properties.h" @@ -318,6 +319,8 @@ static void mch_update_pciexbar(MCHPCIState *mch) addr_mask |= MCH_HOST_BRIDGE_PCIEXBAR_64ADMSK; break; case MCH_HOST_BRIDGE_PCIEXBAR_LENGTH_RVD: + qemu_log_mask(LOG_GUEST_ERROR, "Q35: Reserved PCIEXBAR LENGTH\n"); + return; default: abort(); } diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig index 66e0b15..7fcafec 100644 --- a/hw/ppc/Kconfig +++ b/hw/ppc/Kconfig @@ -13,6 +13,7 @@ 
config PSERIES select MSI_NONBROKEN select FDT_PPC select CHRP_NVRAM + select VOF config SPAPR_RNG bool @@ -75,6 +76,7 @@ config PEGASOS2 select VT82C686 select IDE_VIA select SMBUS_EEPROM + select VOF # This should come with VT82C686 select ACPI_X86 @@ -144,3 +146,6 @@ config FW_CFG_PPC config FDT_PPC bool + +config VOF + bool diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build index 597d974..aa4c8e6 100644 --- a/hw/ppc/meson.build +++ b/hw/ppc/meson.build @@ -84,4 +84,7 @@ ppc_ss.add(when: 'CONFIG_VIRTEX', if_true: files('virtex_ml507.c')) # Pegasos2 ppc_ss.add(when: 'CONFIG_PEGASOS2', if_true: files('pegasos2.c')) +ppc_ss.add(when: 'CONFIG_VOF', if_true: files('vof.c')) +ppc_ss.add(when: ['CONFIG_VOF', 'CONFIG_PSERIES'], if_true: files('spapr_vof.c')) + hw_arch += {'ppc': ppc_ss} diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c index 0bfd092..9a6ae86 100644 --- a/hw/ppc/pegasos2.c +++ b/hw/ppc/pegasos2.c @@ -1,7 +1,7 @@ /* * QEMU PowerPC CHRP (Genesi/bPlan Pegasos II) hardware System Emulator * - * Copyright (c) 2018-2020 BALATON Zoltan + * Copyright (c) 2018-2021 BALATON Zoltan * * This work is licensed under the GNU GPL license version 2 or later. 
* @@ -34,26 +34,68 @@ #include "trace.h" #include "qemu/datadir.h" #include "sysemu/device_tree.h" +#include "hw/ppc/vof.h" -#define PROM_FILENAME "pegasos2.rom" +#include <libfdt.h> + +#define PROM_FILENAME "vof.bin" #define PROM_ADDR 0xfff00000 #define PROM_SIZE 0x80000 +#define KVMPPC_HCALL_BASE 0xf000 +#define KVMPPC_H_RTAS (KVMPPC_HCALL_BASE + 0x0) +#define KVMPPC_H_VOF_CLIENT (KVMPPC_HCALL_BASE + 0x5) + +#define H_SUCCESS 0 +#define H_PRIVILEGE -3 /* Caller not privileged */ +#define H_PARAMETER -4 /* Parameter invalid, out-of-range or conflicting */ + #define BUS_FREQ_HZ 133333333 +#define PCI0_MEM_BASE 0xc0000000 +#define PCI0_MEM_SIZE 0x20000000 +#define PCI0_IO_BASE 0xf8000000 +#define PCI0_IO_SIZE 0x10000 + +#define PCI1_MEM_BASE 0x80000000 +#define PCI1_MEM_SIZE 0x40000000 +#define PCI1_IO_BASE 0xfe000000 +#define PCI1_IO_SIZE 0x10000 + +#define TYPE_PEGASOS2_MACHINE MACHINE_TYPE_NAME("pegasos2") +OBJECT_DECLARE_TYPE(Pegasos2MachineState, MachineClass, PEGASOS2_MACHINE) + +struct Pegasos2MachineState { + MachineState parent_obj; + PowerPCCPU *cpu; + DeviceState *mv; + Vof *vof; + void *fdt_blob; + uint64_t kernel_addr; + uint64_t kernel_entry; + uint64_t kernel_size; +}; + +static void *build_fdt(MachineState *machine, int *fdt_size); + static void pegasos2_cpu_reset(void *opaque) { PowerPCCPU *cpu = opaque; + Pegasos2MachineState *pm = PEGASOS2_MACHINE(current_machine); cpu_reset(CPU(cpu)); cpu->env.spr[SPR_HID1] = 7ULL << 28; + if (pm->vof) { + cpu->env.gpr[1] = 2 * VOF_STACK_SIZE - 0x20; + cpu->env.nip = 0x100; + } } static void pegasos2_init(MachineState *machine) { - PowerPCCPU *cpu = NULL; + Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine); + CPUPPCState *env; MemoryRegion *rom = g_new(MemoryRegion, 1); - DeviceState *mv; PCIBus *pci_bus; PCIDevice *dev; I2CBus *i2c_bus; @@ -63,15 +105,16 @@ static void pegasos2_init(MachineState *machine) uint8_t *spd_data; /* init CPU */ - cpu = POWERPC_CPU(cpu_create(machine->cpu_type)); - if 
(PPC_INPUT(&cpu->env) != PPC_FLAGS_INPUT_6xx) { + pm->cpu = POWERPC_CPU(cpu_create(machine->cpu_type)); + env = &pm->cpu->env; + if (PPC_INPUT(env) != PPC_FLAGS_INPUT_6xx) { error_report("Incompatible CPU, only 6xx bus supported"); exit(1); } /* Set time-base frequency */ - cpu_ppc_tb_init(&cpu->env, BUS_FREQ_HZ / 4); - qemu_register_reset(pegasos2_cpu_reset, cpu); + cpu_ppc_tb_init(env, BUS_FREQ_HZ / 4); + qemu_register_reset(pegasos2_cpu_reset, pm->cpu); /* RAM */ memory_region_add_subregion(get_system_memory(), 0, machine->ram); @@ -82,30 +125,36 @@ static void pegasos2_init(MachineState *machine) error_report("Could not find firmware '%s'", fwname); exit(1); } + if (!machine->firmware && !pm->vof) { + pm->vof = g_malloc0(sizeof(*pm->vof)); + } memory_region_init_rom(rom, NULL, "pegasos2.rom", PROM_SIZE, &error_fatal); memory_region_add_subregion(get_system_memory(), PROM_ADDR, rom); sz = load_elf(filename, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1, PPC_ELF_MACHINE, 0, 0); if (sz <= 0) { - sz = load_image_targphys(filename, PROM_ADDR, PROM_SIZE); + sz = load_image_targphys(filename, pm->vof ? 
0 : PROM_ADDR, PROM_SIZE); } if (sz <= 0 || sz > PROM_SIZE) { error_report("Could not load firmware '%s'", filename); exit(1); } g_free(filename); + if (pm->vof) { + pm->vof->fw_size = sz; + } /* Marvell Discovery II system controller */ - mv = DEVICE(sysbus_create_simple(TYPE_MV64361, -1, - ((qemu_irq *)cpu->env.irq_inputs)[PPC6xx_INPUT_INT])); - pci_bus = mv64361_get_pci_bus(mv, 1); + pm->mv = DEVICE(sysbus_create_simple(TYPE_MV64361, -1, + ((qemu_irq *)env->irq_inputs)[PPC6xx_INPUT_INT])); + pci_bus = mv64361_get_pci_bus(pm->mv, 1); /* VIA VT8231 South Bridge (multifunction PCI device) */ /* VT8231 function 0: PCI-to-ISA Bridge */ dev = pci_create_simple_multifunction(pci_bus, PCI_DEVFN(12, 0), true, TYPE_VT8231_ISA); qdev_connect_gpio_out(DEVICE(dev), 0, - qdev_get_gpio_in_named(mv, "gpp", 31)); + qdev_get_gpio_in_named(pm->mv, "gpp", 31)); /* VT8231 function 1: IDE Controller */ dev = pci_create_simple(pci_bus, PCI_DEVFN(12, 1), "via-ide"); @@ -127,18 +176,728 @@ static void pegasos2_init(MachineState *machine) /* other PC hardware */ pci_vga_init(pci_bus); + + if (machine->kernel_filename) { + sz = load_elf(machine->kernel_filename, NULL, NULL, NULL, + &pm->kernel_entry, &pm->kernel_addr, NULL, NULL, 1, + PPC_ELF_MACHINE, 0, 0); + if (sz <= 0) { + error_report("Could not load kernel '%s'", + machine->kernel_filename); + exit(1); + } + pm->kernel_size = sz; + if (!pm->vof) { + warn_report("Option -kernel may be ineffective with -bios."); + } + } + if (machine->kernel_cmdline && !pm->vof) { + warn_report("Option -append may be ineffective with -bios."); + } +} + +static uint32_t pegasos2_pci_config_read(AddressSpace *as, int bus, + uint32_t addr, uint32_t len) +{ + hwaddr pcicfg = (bus ? 
0xf1000c78 : 0xf1000cf8); + uint32_t val = 0xffffffff; + + stl_le_phys(as, pcicfg, addr | BIT(31)); + switch (len) { + case 4: + val = ldl_le_phys(as, pcicfg + 4); + break; + case 2: + val = lduw_le_phys(as, pcicfg + 4); + break; + case 1: + val = ldub_phys(as, pcicfg + 4); + break; + default: + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid length\n", __func__); + break; + } + return val; +} + +static void pegasos2_pci_config_write(AddressSpace *as, int bus, uint32_t addr, + uint32_t len, uint32_t val) +{ + hwaddr pcicfg = (bus ? 0xf1000c78 : 0xf1000cf8); + + stl_le_phys(as, pcicfg, addr | BIT(31)); + switch (len) { + case 4: + stl_le_phys(as, pcicfg + 4, val); + break; + case 2: + stw_le_phys(as, pcicfg + 4, val); + break; + case 1: + stb_phys(as, pcicfg + 4, val); + break; + default: + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid length\n", __func__); + break; + } +} + +static void pegasos2_machine_reset(MachineState *machine) +{ + Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine); + AddressSpace *as = CPU(pm->cpu)->as; + void *fdt; + uint64_t d[2]; + int sz; + + qemu_devices_reset(); + if (!pm->vof) { + return; /* Firmware should set up machine so nothing to do */ + } + + /* Otherwise, set up devices that board firmware would normally do */ + stl_le_phys(as, 0xf1000000, 0x28020ff); + stl_le_phys(as, 0xf1000278, 0xa31fc); + stl_le_phys(as, 0xf100f300, 0x11ff0400); + stl_le_phys(as, 0xf100f10c, 0x80000000); + stl_le_phys(as, 0xf100001c, 0x8000000); + pegasos2_pci_config_write(as, 0, PCI_COMMAND, 2, PCI_COMMAND_IO | + PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + pegasos2_pci_config_write(as, 1, PCI_COMMAND, 2, PCI_COMMAND_IO | + PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 0) << 8) | + PCI_INTERRUPT_LINE, 2, 0x9); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 0) << 8) | + 0x50, 1, 0x2); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 1) << 8) | + PCI_INTERRUPT_LINE, 2, 0x109); + pegasos2_pci_config_write(as, 
1, (PCI_DEVFN(12, 1) << 8) | + PCI_CLASS_PROG, 1, 0xf); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 1) << 8) | + 0x40, 1, 0xb); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 1) << 8) | + 0x50, 4, 0x17171717); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 1) << 8) | + PCI_COMMAND, 2, 0x87); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 2) << 8) | + PCI_INTERRUPT_LINE, 2, 0x409); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 3) << 8) | + PCI_INTERRUPT_LINE, 2, 0x409); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 4) << 8) | + PCI_INTERRUPT_LINE, 2, 0x9); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 4) << 8) | + 0x48, 4, 0xf00); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 4) << 8) | + 0x40, 4, 0x558020); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 4) << 8) | + 0x90, 4, 0xd00); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 5) << 8) | + PCI_INTERRUPT_LINE, 2, 0x309); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 6) << 8) | + PCI_INTERRUPT_LINE, 2, 0x309); + + /* Device tree and VOF set up */ + vof_init(pm->vof, machine->ram_size, &error_fatal); + if (vof_claim(pm->vof, 0, VOF_STACK_SIZE, VOF_STACK_SIZE) == -1) { + error_report("Memory allocation for stack failed"); + exit(1); + } + if (pm->kernel_size && + vof_claim(pm->vof, pm->kernel_addr, pm->kernel_size, 0) == -1) { + error_report("Memory for kernel is in use"); + exit(1); + } + fdt = build_fdt(machine, &sz); + /* FIXME: VOF assumes entry is same as load address */ + d[0] = cpu_to_be64(pm->kernel_entry); + d[1] = cpu_to_be64(pm->kernel_size - (pm->kernel_entry - pm->kernel_addr)); + qemu_fdt_setprop(fdt, "/chosen", "qemu,boot-kernel", d, sizeof(d)); + + qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt)); + g_free(pm->fdt_blob); + pm->fdt_blob = fdt; + + vof_build_dt(fdt, pm->vof); + vof_client_open_store(fdt, pm->vof, "/chosen", "stdout", "/failsafe"); + pm->cpu->vhyp = PPC_VIRTUAL_HYPERVISOR(machine); } -static void pegasos2_machine(MachineClass *mc) +enum 
pegasos2_rtas_tokens { + RTAS_RESTART_RTAS = 0, + RTAS_NVRAM_FETCH = 1, + RTAS_NVRAM_STORE = 2, + RTAS_GET_TIME_OF_DAY = 3, + RTAS_SET_TIME_OF_DAY = 4, + RTAS_EVENT_SCAN = 6, + RTAS_CHECK_EXCEPTION = 7, + RTAS_READ_PCI_CONFIG = 8, + RTAS_WRITE_PCI_CONFIG = 9, + RTAS_DISPLAY_CHARACTER = 10, + RTAS_SET_INDICATOR = 11, + RTAS_POWER_OFF = 17, + RTAS_SUSPEND = 18, + RTAS_HIBERNATE = 19, + RTAS_SYSTEM_REBOOT = 20, +}; + +static target_ulong pegasos2_rtas(PowerPCCPU *cpu, Pegasos2MachineState *pm, + target_ulong args_real) { + AddressSpace *as = CPU(cpu)->as; + uint32_t token = ldl_be_phys(as, args_real); + uint32_t nargs = ldl_be_phys(as, args_real + 4); + uint32_t nrets = ldl_be_phys(as, args_real + 8); + uint32_t args = args_real + 12; + uint32_t rets = args_real + 12 + nargs * 4; + + if (nrets < 1) { + qemu_log_mask(LOG_GUEST_ERROR, "Too few return values in RTAS call\n"); + return H_PARAMETER; + } + switch (token) { + case RTAS_READ_PCI_CONFIG: + { + uint32_t addr, len, val; + + if (nargs != 2 || nrets != 2) { + stl_be_phys(as, rets, -1); + return H_PARAMETER; + } + addr = ldl_be_phys(as, args); + len = ldl_be_phys(as, args + 4); + val = pegasos2_pci_config_read(as, !(addr >> 24), + addr & 0x0fffffff, len); + stl_be_phys(as, rets, 0); + stl_be_phys(as, rets + 4, val); + return H_SUCCESS; + } + case RTAS_WRITE_PCI_CONFIG: + { + uint32_t addr, len, val; + + if (nargs != 3 || nrets != 1) { + stl_be_phys(as, rets, -1); + return H_PARAMETER; + } + addr = ldl_be_phys(as, args); + len = ldl_be_phys(as, args + 4); + val = ldl_be_phys(as, args + 8); + pegasos2_pci_config_write(as, !(addr >> 24), + addr & 0x0fffffff, len, val); + stl_be_phys(as, rets, 0); + return H_SUCCESS; + } + case RTAS_DISPLAY_CHARACTER: + if (nargs != 1 || nrets != 1) { + stl_be_phys(as, rets, -1); + return H_PARAMETER; + } + qemu_log_mask(LOG_UNIMP, "%c", ldl_be_phys(as, args)); + stl_be_phys(as, rets, 0); + return H_SUCCESS; + default: + qemu_log_mask(LOG_UNIMP, "Unknown RTAS token %u (args=%u, 
rets=%u)\n", + token, nargs, nrets); + stl_be_phys(as, rets, 0); + return H_SUCCESS; + } +} + +static void pegasos2_hypercall(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu) +{ + Pegasos2MachineState *pm = PEGASOS2_MACHINE(vhyp); + CPUPPCState *env = &cpu->env; + + /* The TCG path should also be holding the BQL at this point */ + g_assert(qemu_mutex_iothread_locked()); + + if (msr_pr) { + qemu_log_mask(LOG_GUEST_ERROR, "Hypercall made with MSR[PR]=1\n"); + env->gpr[3] = H_PRIVILEGE; + } else if (env->gpr[3] == KVMPPC_H_RTAS) { + env->gpr[3] = pegasos2_rtas(cpu, pm, env->gpr[4]); + } else if (env->gpr[3] == KVMPPC_H_VOF_CLIENT) { + int ret = vof_client_call(MACHINE(pm), pm->vof, pm->fdt_blob, + env->gpr[4]); + env->gpr[3] = (ret ? H_PARAMETER : H_SUCCESS); + } else { + qemu_log_mask(LOG_GUEST_ERROR, "Unsupported hypercall " TARGET_FMT_lx + "\n", env->gpr[3]); + env->gpr[3] = -1; + } +} + +static void vhyp_nop(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu) +{ +} + +static target_ulong vhyp_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp) +{ + return POWERPC_CPU(current_cpu)->env.spr[SPR_SDR1]; +} + +static void pegasos2_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc); + mc->desc = "Genesi/bPlan Pegasos II"; mc->init = pegasos2_init; + mc->reset = pegasos2_machine_reset; mc->block_default_type = IF_IDE; mc->default_boot_order = "cd"; mc->default_display = "std"; mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("7400_v2.9"); mc->default_ram_id = "pegasos2.ram"; mc->default_ram_size = 512 * MiB; + + vhc->hypercall = pegasos2_hypercall; + vhc->cpu_exec_enter = vhyp_nop; + vhc->cpu_exec_exit = vhyp_nop; + vhc->encode_hpt_for_kvm_pr = vhyp_encode_hpt_for_kvm_pr; +} + +static const TypeInfo pegasos2_machine_info = { + .name = TYPE_PEGASOS2_MACHINE, + .parent = TYPE_MACHINE, + .class_init = pegasos2_machine_class_init, + .instance_size = sizeof(Pegasos2MachineState), + 
.interfaces = (InterfaceInfo[]) { + { TYPE_PPC_VIRTUAL_HYPERVISOR }, + { } + }, +}; + +static void pegasos2_machine_register_types(void) +{ + type_register_static(&pegasos2_machine_info); +} + +type_init(pegasos2_machine_register_types) + +/* FDT creation for passing to firmware */ + +typedef struct { + void *fdt; + const char *path; +} FDTInfo; + +/* We do everything in reverse order so it comes out right in the tree */ + +static void dt_ide(PCIBus *bus, PCIDevice *d, FDTInfo *fi) +{ + qemu_fdt_setprop_string(fi->fdt, fi->path, "device_type", "spi"); } -DEFINE_MACHINE("pegasos2", pegasos2_machine) +static void dt_usb(PCIBus *bus, PCIDevice *d, FDTInfo *fi) +{ + qemu_fdt_setprop_cell(fi->fdt, fi->path, "#size-cells", 0); + qemu_fdt_setprop_cell(fi->fdt, fi->path, "#address-cells", 1); + qemu_fdt_setprop_string(fi->fdt, fi->path, "device_type", "usb"); +} + +static void dt_isa(PCIBus *bus, PCIDevice *d, FDTInfo *fi) +{ + GString *name = g_string_sized_new(64); + uint32_t cells[3]; + + qemu_fdt_setprop_cell(fi->fdt, fi->path, "#size-cells", 1); + qemu_fdt_setprop_cell(fi->fdt, fi->path, "#address-cells", 2); + qemu_fdt_setprop_string(fi->fdt, fi->path, "device_type", "isa"); + qemu_fdt_setprop_string(fi->fdt, fi->path, "name", "isa"); + + /* addional devices */ + g_string_printf(name, "%s/lpt@i3bc", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(7); + cells[1] = 0; + qemu_fdt_setprop(fi->fdt, name->str, "interrupts", + cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x3bc); + cells[2] = cpu_to_be32(8); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "lpt"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "lpt"); + + g_string_printf(name, "%s/fdc@i3f0", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_cell(fi->fdt, 
name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(6); + cells[1] = 0; + qemu_fdt_setprop(fi->fdt, name->str, "interrupts", + cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x3f0); + cells[2] = cpu_to_be32(8); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "fdc"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "fdc"); + + g_string_printf(name, "%s/timer@i40", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x40); + cells[2] = cpu_to_be32(8); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "timer"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "timer"); + + g_string_printf(name, "%s/rtc@i70", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_string(fi->fdt, name->str, "compatible", "ds1385-rtc"); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(8); + cells[1] = 0; + qemu_fdt_setprop(fi->fdt, name->str, "interrupts", + cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x70); + cells[2] = cpu_to_be32(2); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "rtc"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "rtc"); + + g_string_printf(name, "%s/keyboard@i60", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + cells[0] = cpu_to_be32(1); + cells[1] = 0; + qemu_fdt_setprop(fi->fdt, name->str, "interrupts", + cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x60); + cells[2] = cpu_to_be32(5); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + 
qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "keyboard"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "keyboard"); + + g_string_printf(name, "%s/8042@i60", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_cell(fi->fdt, name->str, "#interrupt-cells", 2); + qemu_fdt_setprop_cell(fi->fdt, name->str, "#size-cells", 0); + qemu_fdt_setprop_cell(fi->fdt, name->str, "#address-cells", 1); + qemu_fdt_setprop_string(fi->fdt, name->str, "interrupt-controller", ""); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x60); + cells[2] = cpu_to_be32(5); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", ""); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "8042"); + + g_string_printf(name, "%s/serial@i2f8", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(3); + cells[1] = 0; + qemu_fdt_setprop(fi->fdt, name->str, "interrupts", + cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x2f8); + cells[2] = cpu_to_be32(8); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "serial"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "serial"); + + g_string_free(name, TRUE); +} + +static struct { + const char *id; + const char *name; + void (*dtf)(PCIBus *bus, PCIDevice *d, FDTInfo *fi); +} device_map[] = { + { "pci11ab,6460", "host", NULL }, + { "pci1106,8231", "isa", dt_isa }, + { "pci1106,571", "ide", dt_ide }, + { "pci1106,3044", "firewire", NULL }, + { "pci1106,3038", "usb", dt_usb }, + { "pci1106,8235", "other", NULL }, + { "pci1106,3058", "sound", NULL }, + { NULL, NULL } +}; + +static void add_pci_device(PCIBus *bus, PCIDevice *d, void *opaque) +{ + FDTInfo 
*fi = opaque; + GString *node = g_string_new(NULL); + uint32_t cells[(PCI_NUM_REGIONS + 1) * 5]; + int i, j; + const char *name = NULL; + g_autofree const gchar *pn = g_strdup_printf("pci%x,%x", + pci_get_word(&d->config[PCI_VENDOR_ID]), + pci_get_word(&d->config[PCI_DEVICE_ID])); + + for (i = 0; device_map[i].id; i++) { + if (!strcmp(pn, device_map[i].id)) { + name = device_map[i].name; + break; + } + } + g_string_printf(node, "%s/%s@%x", fi->path, (name ?: pn), + PCI_SLOT(d->devfn)); + if (PCI_FUNC(d->devfn)) { + g_string_append_printf(node, ",%x", PCI_FUNC(d->devfn)); + } + + qemu_fdt_add_subnode(fi->fdt, node->str); + if (device_map[i].dtf) { + FDTInfo cfi = { fi->fdt, node->str }; + device_map[i].dtf(bus, d, &cfi); + } + cells[0] = cpu_to_be32(d->devfn << 8); + cells[1] = 0; + cells[2] = 0; + cells[3] = 0; + cells[4] = 0; + j = 5; + for (i = 0; i < PCI_NUM_REGIONS; i++) { + if (!d->io_regions[i].size) { + continue; + } + cells[j] = cpu_to_be32(d->devfn << 8 | (PCI_BASE_ADDRESS_0 + i * 4)); + if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) { + cells[j] |= cpu_to_be32(1 << 24); + } else { + cells[j] |= cpu_to_be32(2 << 24); + if (d->io_regions[i].type & PCI_BASE_ADDRESS_MEM_PREFETCH) { + cells[j] |= cpu_to_be32(4 << 28); + } + } + cells[j + 1] = 0; + cells[j + 2] = 0; + cells[j + 3] = cpu_to_be32(d->io_regions[i].size >> 32); + cells[j + 4] = cpu_to_be32(d->io_regions[i].size); + j += 5; + } + qemu_fdt_setprop(fi->fdt, node->str, "reg", cells, j * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, node->str, "name", name ?: pn); + if (pci_get_byte(&d->config[PCI_INTERRUPT_PIN])) { + qemu_fdt_setprop_cell(fi->fdt, node->str, "interrupts", + pci_get_byte(&d->config[PCI_INTERRUPT_PIN])); + } + /* Pegasos2 firmware has subsystem-id and subsystem-vendor-id swapped */ + qemu_fdt_setprop_cell(fi->fdt, node->str, "subsystem-vendor-id", + pci_get_word(&d->config[PCI_SUBSYSTEM_ID])); + qemu_fdt_setprop_cell(fi->fdt, node->str, "subsystem-id", + 
pci_get_word(&d->config[PCI_SUBSYSTEM_VENDOR_ID])); + cells[0] = pci_get_long(&d->config[PCI_CLASS_REVISION]); + qemu_fdt_setprop_cell(fi->fdt, node->str, "class-code", cells[0] >> 8); + qemu_fdt_setprop_cell(fi->fdt, node->str, "revision-id", cells[0] & 0xff); + qemu_fdt_setprop_cell(fi->fdt, node->str, "device-id", + pci_get_word(&d->config[PCI_DEVICE_ID])); + qemu_fdt_setprop_cell(fi->fdt, node->str, "vendor-id", + pci_get_word(&d->config[PCI_VENDOR_ID])); + + g_string_free(node, TRUE); +} + +static void *build_fdt(MachineState *machine, int *fdt_size) +{ + Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine); + PowerPCCPU *cpu = pm->cpu; + PCIBus *pci_bus; + FDTInfo fi; + uint32_t cells[16]; + void *fdt = create_device_tree(fdt_size); + + fi.fdt = fdt; + + /* root node */ + qemu_fdt_setprop_string(fdt, "/", "CODEGEN,description", + "Pegasos CHRP PowerPC System"); + qemu_fdt_setprop_string(fdt, "/", "CODEGEN,board", "Pegasos2"); + qemu_fdt_setprop_string(fdt, "/", "CODEGEN,vendor", "bplan GmbH"); + qemu_fdt_setprop_string(fdt, "/", "revision", "2B"); + qemu_fdt_setprop_string(fdt, "/", "model", "Pegasos2"); + qemu_fdt_setprop_string(fdt, "/", "device_type", "chrp"); + qemu_fdt_setprop_cell(fdt, "/", "#address-cells", 1); + qemu_fdt_setprop_string(fdt, "/", "name", "bplan,Pegasos2"); + + /* pci@c0000000 */ + qemu_fdt_add_subnode(fdt, "/pci@c0000000"); + cells[0] = 0; + cells[1] = 0; + qemu_fdt_setprop(fdt, "/pci@c0000000", "bus-range", + cells, 2 * sizeof(cells[0])); + qemu_fdt_setprop_cell(fdt, "/pci@c0000000", "pci-bridge-number", 1); + cells[0] = cpu_to_be32(PCI0_MEM_BASE); + cells[1] = cpu_to_be32(PCI0_MEM_SIZE); + qemu_fdt_setprop(fdt, "/pci@c0000000", "reg", cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(0x01000000); + cells[1] = 0; + cells[2] = 0; + cells[3] = cpu_to_be32(PCI0_IO_BASE); + cells[4] = 0; + cells[5] = cpu_to_be32(PCI0_IO_SIZE); + cells[6] = cpu_to_be32(0x02000000); + cells[7] = 0; + cells[8] = cpu_to_be32(PCI0_MEM_BASE); + cells[9] = 
cpu_to_be32(PCI0_MEM_BASE); + cells[10] = 0; + cells[11] = cpu_to_be32(PCI0_MEM_SIZE); + qemu_fdt_setprop(fdt, "/pci@c0000000", "ranges", + cells, 12 * sizeof(cells[0])); + qemu_fdt_setprop_cell(fdt, "/pci@c0000000", "#size-cells", 2); + qemu_fdt_setprop_cell(fdt, "/pci@c0000000", "#address-cells", 3); + qemu_fdt_setprop_string(fdt, "/pci@c0000000", "device_type", "pci"); + qemu_fdt_setprop_string(fdt, "/pci@c0000000", "name", "pci"); + + fi.path = "/pci@c0000000"; + pci_bus = mv64361_get_pci_bus(pm->mv, 0); + pci_for_each_device_reverse(pci_bus, 0, add_pci_device, &fi); + + /* pci@80000000 */ + qemu_fdt_add_subnode(fdt, "/pci@80000000"); + cells[0] = 0; + cells[1] = 0; + qemu_fdt_setprop(fdt, "/pci@80000000", "bus-range", + cells, 2 * sizeof(cells[0])); + qemu_fdt_setprop_cell(fdt, "/pci@80000000", "pci-bridge-number", 0); + cells[0] = cpu_to_be32(PCI1_MEM_BASE); + cells[1] = cpu_to_be32(PCI1_MEM_SIZE); + qemu_fdt_setprop(fdt, "/pci@80000000", "reg", cells, 2 * sizeof(cells[0])); + qemu_fdt_setprop_cell(fdt, "/pci@80000000", "8259-interrupt-acknowledge", + 0xf1000cb4); + cells[0] = cpu_to_be32(0x01000000); + cells[1] = 0; + cells[2] = 0; + cells[3] = cpu_to_be32(PCI1_IO_BASE); + cells[4] = 0; + cells[5] = cpu_to_be32(PCI1_IO_SIZE); + cells[6] = cpu_to_be32(0x02000000); + cells[7] = 0; + cells[8] = cpu_to_be32(PCI1_MEM_BASE); + cells[9] = cpu_to_be32(PCI1_MEM_BASE); + cells[10] = 0; + cells[11] = cpu_to_be32(PCI1_MEM_SIZE); + qemu_fdt_setprop(fdt, "/pci@80000000", "ranges", + cells, 12 * sizeof(cells[0])); + qemu_fdt_setprop_cell(fdt, "/pci@80000000", "#size-cells", 2); + qemu_fdt_setprop_cell(fdt, "/pci@80000000", "#address-cells", 3); + qemu_fdt_setprop_string(fdt, "/pci@80000000", "device_type", "pci"); + qemu_fdt_setprop_string(fdt, "/pci@80000000", "name", "pci"); + + fi.path = "/pci@80000000"; + pci_bus = mv64361_get_pci_bus(pm->mv, 1); + pci_for_each_device_reverse(pci_bus, 0, add_pci_device, &fi); + + qemu_fdt_add_subnode(fdt, "/failsafe"); + 
qemu_fdt_setprop_string(fdt, "/failsafe", "device_type", "serial"); + qemu_fdt_setprop_string(fdt, "/failsafe", "name", "failsafe"); + + qemu_fdt_add_subnode(fdt, "/rtas"); + qemu_fdt_setprop_cell(fdt, "/rtas", "system-reboot", RTAS_SYSTEM_REBOOT); + qemu_fdt_setprop_cell(fdt, "/rtas", "hibernate", RTAS_HIBERNATE); + qemu_fdt_setprop_cell(fdt, "/rtas", "suspend", RTAS_SUSPEND); + qemu_fdt_setprop_cell(fdt, "/rtas", "power-off", RTAS_POWER_OFF); + qemu_fdt_setprop_cell(fdt, "/rtas", "set-indicator", RTAS_SET_INDICATOR); + qemu_fdt_setprop_cell(fdt, "/rtas", "display-character", + RTAS_DISPLAY_CHARACTER); + qemu_fdt_setprop_cell(fdt, "/rtas", "write-pci-config", + RTAS_WRITE_PCI_CONFIG); + qemu_fdt_setprop_cell(fdt, "/rtas", "read-pci-config", + RTAS_READ_PCI_CONFIG); + /* Pegasos2 firmware misspells check-exception and guests use that */ + qemu_fdt_setprop_cell(fdt, "/rtas", "check-execption", + RTAS_CHECK_EXCEPTION); + qemu_fdt_setprop_cell(fdt, "/rtas", "event-scan", RTAS_EVENT_SCAN); + qemu_fdt_setprop_cell(fdt, "/rtas", "set-time-of-day", + RTAS_SET_TIME_OF_DAY); + qemu_fdt_setprop_cell(fdt, "/rtas", "get-time-of-day", + RTAS_GET_TIME_OF_DAY); + qemu_fdt_setprop_cell(fdt, "/rtas", "nvram-store", RTAS_NVRAM_STORE); + qemu_fdt_setprop_cell(fdt, "/rtas", "nvram-fetch", RTAS_NVRAM_FETCH); + qemu_fdt_setprop_cell(fdt, "/rtas", "restart-rtas", RTAS_RESTART_RTAS); + qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-error-log-max", 0); + qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-event-scan-rate", 0); + qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-display-device", 0); + qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20); + qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-version", 1); + + /* cpus */ + qemu_fdt_add_subnode(fdt, "/cpus"); + qemu_fdt_setprop_cell(fdt, "/cpus", "#cpus", 1); + qemu_fdt_setprop_cell(fdt, "/cpus", "#address-cells", 1); + qemu_fdt_setprop_cell(fdt, "/cpus", "#size-cells", 0); + qemu_fdt_setprop_string(fdt, "/cpus", "name", "cpus"); + + /* FIXME Get CPU name 
from CPU object */ + const char *cp = "/cpus/PowerPC,G4"; + qemu_fdt_add_subnode(fdt, cp); + qemu_fdt_setprop_cell(fdt, cp, "l2cr", 0); + qemu_fdt_setprop_cell(fdt, cp, "d-cache-size", 0x8000); + qemu_fdt_setprop_cell(fdt, cp, "d-cache-block-size", + cpu->env.dcache_line_size); + qemu_fdt_setprop_cell(fdt, cp, "d-cache-line-size", + cpu->env.dcache_line_size); + qemu_fdt_setprop_cell(fdt, cp, "i-cache-size", 0x8000); + qemu_fdt_setprop_cell(fdt, cp, "i-cache-block-size", + cpu->env.icache_line_size); + qemu_fdt_setprop_cell(fdt, cp, "i-cache-line-size", + cpu->env.icache_line_size); + if (cpu->env.id_tlbs) { + qemu_fdt_setprop_cell(fdt, cp, "i-tlb-sets", cpu->env.nb_ways); + qemu_fdt_setprop_cell(fdt, cp, "i-tlb-size", cpu->env.tlb_per_way); + qemu_fdt_setprop_cell(fdt, cp, "d-tlb-sets", cpu->env.nb_ways); + qemu_fdt_setprop_cell(fdt, cp, "d-tlb-size", cpu->env.tlb_per_way); + qemu_fdt_setprop_string(fdt, cp, "tlb-split", ""); + } + qemu_fdt_setprop_cell(fdt, cp, "tlb-sets", cpu->env.nb_ways); + qemu_fdt_setprop_cell(fdt, cp, "tlb-size", cpu->env.nb_tlb); + qemu_fdt_setprop_string(fdt, cp, "state", "running"); + if (cpu->env.insns_flags & PPC_ALTIVEC) { + qemu_fdt_setprop_string(fdt, cp, "altivec", ""); + qemu_fdt_setprop_string(fdt, cp, "data-streams", ""); + } + /* + * FIXME What flags do data-streams, external-control and + * performance-monitor depend on? 
+ */ + qemu_fdt_setprop_string(fdt, cp, "external-control", ""); + if (cpu->env.insns_flags & PPC_FLOAT_FSQRT) { + qemu_fdt_setprop_string(fdt, cp, "general-purpose", ""); + } + qemu_fdt_setprop_string(fdt, cp, "performance-monitor", ""); + if (cpu->env.insns_flags & PPC_FLOAT_FRES) { + qemu_fdt_setprop_string(fdt, cp, "graphics", ""); + } + qemu_fdt_setprop_cell(fdt, cp, "reservation-granule-size", 4); + qemu_fdt_setprop_cell(fdt, cp, "timebase-frequency", + cpu->env.tb_env->tb_freq); + qemu_fdt_setprop_cell(fdt, cp, "bus-frequency", BUS_FREQ_HZ); + qemu_fdt_setprop_cell(fdt, cp, "clock-frequency", BUS_FREQ_HZ * 7.5); + qemu_fdt_setprop_cell(fdt, cp, "cpu-version", cpu->env.spr[SPR_PVR]); + cells[0] = 0; + cells[1] = 0; + qemu_fdt_setprop(fdt, cp, "reg", cells, 2 * sizeof(cells[0])); + qemu_fdt_setprop_string(fdt, cp, "device_type", "cpu"); + qemu_fdt_setprop_string(fdt, cp, "name", strrchr(cp, '/') + 1); + + /* memory */ + qemu_fdt_add_subnode(fdt, "/memory@0"); + cells[0] = 0; + cells[1] = cpu_to_be32(machine->ram_size); + qemu_fdt_setprop(fdt, "/memory@0", "reg", cells, 2 * sizeof(cells[0])); + qemu_fdt_setprop_string(fdt, "/memory@0", "device_type", "memory"); + qemu_fdt_setprop_string(fdt, "/memory@0", "name", "memory"); + + qemu_fdt_add_subnode(fdt, "/chosen"); + qemu_fdt_setprop_string(fdt, "/chosen", "bootargs", + machine->kernel_cmdline ?: ""); + qemu_fdt_setprop_string(fdt, "/chosen", "name", "chosen"); + + qemu_fdt_add_subnode(fdt, "/openprom"); + qemu_fdt_setprop_string(fdt, "/openprom", "model", "Pegasos2,1.1"); + + return fdt; +} diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 4dd90b7..a007be4 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -101,6 +101,7 @@ #define FDT_MAX_ADDR 0x80000000 /* FDT must stay below that */ #define FW_MAX_SIZE 0x400000 #define FW_FILE_NAME "slof.bin" +#define FW_FILE_NAME_VOF "vof.bin" #define FW_OVERHEAD 0x2800000 #define KERNEL_LOAD_ADDR FW_MAX_SIZE @@ -880,6 +881,10 @@ static void spapr_dt_rtas(SpaprMachineState 
*spapr, void *fdt) add_str(hypertas, "hcall-copy"); add_str(hypertas, "hcall-debug"); add_str(hypertas, "hcall-vphn"); + if (spapr_get_cap(spapr, SPAPR_CAP_RPT_INVALIDATE) == SPAPR_CAP_ON) { + add_str(hypertas, "hcall-rpt-invalidate"); + } + add_str(qemu_hypertas, "hcall-memop1"); if (!kvm_enabled() || kvmppc_spapr_use_multitce()) { @@ -919,9 +924,13 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) * * The extra 8 bytes is required because Linux's FWNMI error log check * is off-by-one. + * + * RTAS_MIN_SIZE is required for the RTAS blob itself. */ - _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX + - ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t))); + _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_MIN_SIZE + + RTAS_ERROR_LOG_MAX + + ms->smp.max_cpus * sizeof(uint64_t) * 2 + + sizeof(uint64_t))); _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max", RTAS_ERROR_LOG_MAX)); _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate", @@ -1639,22 +1648,29 @@ static void spapr_machine_reset(MachineState *machine) fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE; fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE); + if (spapr->vof) { + spapr_vof_reset(spapr, fdt, &error_fatal); + /* + * Do not pack the FDT as the client may change properties. + * VOF client does not expect the FDT so we do not load it to the VM. 
+ */ + } else { + rc = fdt_pack(fdt); + /* Should only fail if we've built a corrupted tree */ + assert(rc == 0); - rc = fdt_pack(fdt); - - /* Should only fail if we've built a corrupted tree */ - assert(rc == 0); - - /* Load the fdt */ + spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, + 0, fdt_addr, 0); + cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt)); + } qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt)); - cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt)); + g_free(spapr->fdt_blob); spapr->fdt_size = fdt_totalsize(fdt); spapr->fdt_initial_size = spapr->fdt_size; spapr->fdt_blob = fdt; /* Set up the entry state */ - spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, fdt_addr, 0); first_ppc_cpu->env.gpr[5] = 0; spapr->fwnmi_system_reset_addr = -1; @@ -2018,6 +2034,7 @@ static const VMStateDescription vmstate_spapr = { &vmstate_spapr_cap_ccf_assist, &vmstate_spapr_cap_fwnmi, &vmstate_spapr_fwnmi, + &vmstate_spapr_cap_rpt_invalidate, NULL } }; @@ -2657,7 +2674,8 @@ static void spapr_machine_init(MachineState *machine) SpaprMachineState *spapr = SPAPR_MACHINE(machine); SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); MachineClass *mc = MACHINE_GET_CLASS(machine); - const char *bios_name = machine->firmware ?: FW_FILE_NAME; + const char *bios_default = spapr->vof ? 
FW_FILE_NAME_VOF : FW_FILE_NAME; + const char *bios_name = machine->firmware ?: bios_default; const char *kernel_filename = machine->kernel_filename; const char *initrd_filename = machine->initrd_filename; PCIHostState *phb; @@ -3014,6 +3032,10 @@ static void spapr_machine_init(MachineState *machine) } qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond); + if (spapr->vof) { + spapr->vof->fw_size = fw_size; /* for claim() on itself */ + spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client); + } } #define DEFAULT_KVM_TYPE "auto" @@ -3204,6 +3226,28 @@ static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp) } } +static bool spapr_get_vof(Object *obj, Error **errp) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(obj); + + return spapr->vof != NULL; +} + +static void spapr_set_vof(Object *obj, bool value, Error **errp) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(obj); + + if (spapr->vof) { + vof_cleanup(spapr->vof); + g_free(spapr->vof); + spapr->vof = NULL; + } + if (!value) { + return; + } + spapr->vof = g_malloc0(sizeof(*spapr->vof)); +} + static char *spapr_get_ic_mode(Object *obj, Error **errp) { SpaprMachineState *spapr = SPAPR_MACHINE(obj); @@ -3329,6 +3373,11 @@ static void spapr_instance_init(Object *obj) stringify(KERNEL_LOAD_ADDR) " for -kernel is the default"); spapr->kernel_addr = KERNEL_LOAD_ADDR; + + object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof); + object_property_set_description(obj, "x-vof", + "Enable Virtual Open Firmware (experimental)"); + /* The machine class defines the default interrupt controller mode */ spapr->irq = smc->irq; object_property_add_str(obj, "ic-mode", spapr_get_ic_mode, @@ -4492,6 +4541,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) XICSFabricClass *xic = XICS_FABRIC_CLASS(oc); InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc); XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc); + VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc); 
mc->desc = "pSeries Logical Partition (PAPR compliant)"; mc->ignore_boot_device_suffixes = true; @@ -4573,6 +4623,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON; smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON; smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON; + smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF; spapr_caps_add_properties(smc); smc->irq = &spapr_irq_dual; smc->dr_phb_enabled = true; @@ -4580,6 +4631,9 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) smc->smp_threads_vsmt = true; smc->nr_xirqs = SPAPR_NR_XIRQS; xfc->match_nvt = spapr_match_nvt; + vmc->client_architecture_support = spapr_vof_client_architecture_support; + vmc->quiesce = spapr_vof_quiesce; + vmc->setprop = spapr_vof_setprop; } static const TypeInfo spapr_machine_info = { @@ -4599,6 +4653,7 @@ static const TypeInfo spapr_machine_info = { { TYPE_XICS_FABRIC }, { TYPE_INTERRUPT_STATS_PROVIDER }, { TYPE_XIVE_FABRIC }, + { TYPE_VOF_MACHINE_IF }, { } }, }; diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index d0c419b..ed7c077 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -582,6 +582,37 @@ static void cap_fwnmi_apply(SpaprMachineState *spapr, uint8_t val, } } +static void cap_rpt_invalidate_apply(SpaprMachineState *spapr, + uint8_t val, Error **errp) +{ + ERRP_GUARD(); + + if (!val) { + /* capability disabled by default */ + return; + } + + if (tcg_enabled()) { + error_setg(errp, "No H_RPT_INVALIDATE support in TCG"); + error_append_hint(errp, + "Try appending -machine cap-rpt-invalidate=off\n"); + } else if (kvm_enabled()) { + if (!kvmppc_has_cap_mmu_radix()) { + error_setg(errp, "H_RPT_INVALIDATE only supported on Radix"); + return; + } + + if (!kvmppc_has_cap_rpt_invalidate()) { + error_setg(errp, + "KVM implementation does not support H_RPT_INVALIDATE"); + error_append_hint(errp, + "Try appending -machine 
cap-rpt-invalidate=off\n"); + } else { + kvmppc_enable_h_rpt_invalidate(); + } + } +} + SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = { [SPAPR_CAP_HTM] = { .name = "htm", @@ -690,6 +721,15 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = { .type = "bool", .apply = cap_fwnmi_apply, }, + [SPAPR_CAP_RPT_INVALIDATE] = { + .name = "rpt-invalidate", + .description = "Allow H_RPT_INVALIDATE", + .index = SPAPR_CAP_RPT_INVALIDATE, + .get = spapr_cap_get_bool, + .set = spapr_cap_set_bool, + .type = "bool", + .apply = cap_rpt_invalidate_apply, + }, }; static SpaprCapabilities default_caps_with_cpu(SpaprMachineState *spapr, @@ -830,6 +870,7 @@ SPAPR_CAP_MIG_STATE(nested_kvm_hv, SPAPR_CAP_NESTED_KVM_HV); SPAPR_CAP_MIG_STATE(large_decr, SPAPR_CAP_LARGE_DECREMENTER); SPAPR_CAP_MIG_STATE(ccf_assist, SPAPR_CAP_CCF_ASSIST); SPAPR_CAP_MIG_STATE(fwnmi, SPAPR_CAP_FWNMI); +SPAPR_CAP_MIG_STATE(rpt_invalidate, SPAPR_CAP_RPT_INVALIDATE); void spapr_caps_init(SpaprMachineState *spapr) { diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index f25014a..0e9a5b2 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -1233,8 +1233,7 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, spapr_setup_hpt(spapr); } - fdt = spapr_build_fdt(spapr, false, fdt_bufsize); - + fdt = spapr_build_fdt(spapr, spapr->vof != NULL, fdt_bufsize); g_free(spapr->fdt_blob); spapr->fdt_size = fdt_totalsize(fdt); spapr->fdt_initial_size = spapr->fdt_size; @@ -1277,6 +1276,25 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, return ret; } +target_ulong spapr_vof_client_architecture_support(MachineState *ms, + CPUState *cs, + target_ulong ovec_addr) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(ms); + + target_ulong ret = do_client_architecture_support(POWERPC_CPU(cs), spapr, + ovec_addr, FDT_MAX_SIZE); + + /* + * This adds stdout and generates phandles for boottime and CAS FDTs. 
+ * It is alright to update the FDT here as do_client_architecture_support() + * does not pack it. + */ + spapr_vof_client_dt_finalize(spapr, spapr->fdt_blob); + + return ret; +} + static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu, SpaprMachineState *spapr, target_ulong opcode, @@ -1299,6 +1317,8 @@ static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu, behaviour |= H_CPU_BEHAV_L1D_FLUSH_PR; break; case SPAPR_CAP_FIXED: + behaviour |= H_CPU_BEHAV_NO_L1D_FLUSH_ENTRY; + behaviour |= H_CPU_BEHAV_NO_L1D_FLUSH_UACCESS; break; default: /* broken */ assert(safe_cache == SPAPR_CAP_BROKEN); diff --git a/hw/ppc/spapr_vof.c b/hw/ppc/spapr_vof.c new file mode 100644 index 0000000..40ce8fe --- /dev/null +++ b/hw/ppc/spapr_vof.c @@ -0,0 +1,167 @@ +/* + * SPAPR machine hooks to Virtual Open Firmware, + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qapi/error.h" +#include "hw/ppc/spapr.h" +#include "hw/ppc/spapr_vio.h" +#include "hw/ppc/spapr_cpu_core.h" +#include "hw/ppc/fdt.h" +#include "hw/ppc/vof.h" +#include "sysemu/sysemu.h" +#include "qom/qom-qobject.h" +#include "trace.h" + +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr, + target_ulong opcode, target_ulong *_args) +{ + int ret = vof_client_call(MACHINE(spapr), spapr->vof, spapr->fdt_blob, + ppc64_phys_to_real(_args[0])); + + if (ret) { + return H_PARAMETER; + } + return H_SUCCESS; +} + +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt) +{ + char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus); + + vof_build_dt(fdt, spapr->vof); + + if (spapr->vof->bootargs) { + int chosen; + + _FDT(chosen = fdt_path_offset(fdt, "/chosen")); + /* + * If the client did not change "bootargs", spapr_dt_chosen() must have + * stored machine->kernel_cmdline in it before getting here. 
+ */ + _FDT(fdt_setprop_string(fdt, chosen, "bootargs", spapr->vof->bootargs)); + } + + /* + * SLOF-less setup requires an open instance of stdout for early + * kernel printk. By now all phandles are settled so we can open + * the default serial console. + */ + if (stdout_path) { + _FDT(vof_client_open_store(fdt, spapr->vof, "/chosen", "stdout", + stdout_path)); + } +} + +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt, Error **errp) +{ + target_ulong stack_ptr; + Vof *vof = spapr->vof; + PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu); + + vof_init(vof, spapr->rma_size, errp); + + stack_ptr = vof_claim(vof, 0, VOF_STACK_SIZE, VOF_STACK_SIZE); + if (stack_ptr == -1) { + error_setg(errp, "Memory allocation for stack failed"); + return; + } + /* Stack grows downwards plus reserve space for the minimum stack frame */ + stack_ptr += VOF_STACK_SIZE - 0x20; + + if (spapr->kernel_size && + vof_claim(vof, spapr->kernel_addr, spapr->kernel_size, 0) == -1) { + error_setg(errp, "Memory for kernel is in use"); + return; + } + + if (spapr->initrd_size && + vof_claim(vof, spapr->initrd_base, spapr->initrd_size, 0) == -1) { + error_setg(errp, "Memory for initramdisk is in use"); + return; + } + + spapr_vof_client_dt_finalize(spapr, fdt); + + spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, + stack_ptr, spapr->initrd_base, + spapr->initrd_size); + /* VOF is 32bit BE so enforce MSR here */ + first_ppc_cpu->env.msr &= ~((1ULL << MSR_SF) | (1ULL << MSR_LE)); + + /* + * At this point the expected allocation map is: + * + * 0..c38 - the initial firmware + * 8000..10000 - stack + * 400000.. - kernel + * 3ea0000.. - initramdisk + * + * We skip writing FDT as nothing expects it; OF client interface is + * going to be used for reading the device tree. 
+ */ +} + +void spapr_vof_quiesce(MachineState *ms) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(ms); + + spapr->fdt_size = fdt_totalsize(spapr->fdt_blob); + spapr->fdt_initial_size = spapr->fdt_size; +} + +bool spapr_vof_setprop(MachineState *ms, const char *path, const char *propname, + void *val, int vallen) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(ms); + + /* + * We only allow changing properties which we know how to update in QEMU + * OR + * the ones which we know that they need to survive during "quiesce". + */ + + if (strcmp(path, "/rtas") == 0) { + if (strcmp(propname, "linux,rtas-base") == 0 || + strcmp(propname, "linux,rtas-entry") == 0) { + /* These need to survive quiesce so let them store in the FDT */ + return true; + } + } + + if (strcmp(path, "/chosen") == 0) { + if (strcmp(propname, "bootargs") == 0) { + Vof *vof = spapr->vof; + + g_free(vof->bootargs); + vof->bootargs = g_strndup(val, vallen); + return true; + } + if (strcmp(propname, "linux,initrd-start") == 0) { + if (vallen == sizeof(uint32_t)) { + spapr->initrd_base = ldl_be_p(val); + return true; + } + if (vallen == sizeof(uint64_t)) { + spapr->initrd_base = ldq_be_p(val); + return true; + } + return false; + } + if (strcmp(propname, "linux,initrd-end") == 0) { + if (vallen == sizeof(uint32_t)) { + spapr->initrd_size = ldl_be_p(val) - spapr->initrd_base; + return true; + } + if (vallen == sizeof(uint64_t)) { + spapr->initrd_size = ldq_be_p(val) - spapr->initrd_base; + return true; + } + return false; + } + } + + return true; +} diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events index 0ba3e40..6e90a01 100644 --- a/hw/ppc/trace-events +++ b/hw/ppc/trace-events @@ -71,6 +71,30 @@ spapr_rtas_ibm_configure_connector_invalid(uint32_t index) "DRC index: 0x%"PRIx3 spapr_vio_h_reg_crq(uint64_t reg, uint64_t queue_addr, uint64_t queue_len) "CRQ for dev 0x%" PRIx64 " registered at 0x%" PRIx64 "/0x%" PRIx64 spapr_vio_free_crq(uint32_t reg) "CRQ for dev 0x%" PRIx32 " freed" +# vof.c 
+vof_error_str_truncated(const char *s, int len) "%s truncated to %d" +vof_error_param(const char *method, int nargscheck, int nretcheck, int nargs, int nret) "%s takes/returns %d/%d, not %d/%d" +vof_error_unknown_service(const char *service, int nargs, int nret) "\"%s\" args=%d rets=%d" +vof_error_unknown_method(const char *method) "\"%s\"" +vof_error_unknown_ihandle_close(uint32_t ih) "ih=0x%x" +vof_error_unknown_path(const char *path) "\"%s\"" +vof_error_write(uint32_t ih) "ih=0x%x" +vof_finddevice(const char *path, uint32_t ph) "\"%s\" => ph=0x%x" +vof_claim(uint32_t virt, uint32_t size, uint32_t align, uint32_t ret) "virt=0x%x size=0x%x align=0x%x => 0x%x" +vof_release(uint32_t virt, uint32_t size, uint32_t ret) "virt=0x%x size=0x%x => 0x%x" +vof_method(uint32_t ihandle, const char *method, uint32_t param, uint32_t ret, uint32_t ret2) "ih=0x%x \"%s\"(0x%x) => 0x%x 0x%x" +vof_getprop(uint32_t ph, const char *prop, uint32_t ret, const char *val) "ph=0x%x \"%s\" => len=%d [%s]" +vof_getproplen(uint32_t ph, const char *prop, uint32_t ret) "ph=0x%x \"%s\" => len=%d" +vof_setprop(uint32_t ph, const char *prop, const char *val, uint32_t vallen, uint32_t ret) "ph=0x%x \"%s\" [%s] len=%d => ret=%d" +vof_open(const char *path, uint32_t ph, uint32_t ih) "%s ph=0x%x => ih=0x%x" +vof_interpret(const char *cmd, uint32_t param1, uint32_t param2, uint32_t ret, uint32_t ret2) "[%s] 0x%x 0x%x => 0x%x 0x%x" +vof_package_to_path(uint32_t ph, const char *tmp, uint32_t ret) "ph=0x%x => %s len=%d" +vof_instance_to_path(uint32_t ih, uint32_t ph, const char *tmp, uint32_t ret) "ih=0x%x ph=0x%x => %s len=%d" +vof_instance_to_package(uint32_t ih, uint32_t ph) "ih=0x%x => ph=0x%x" +vof_write(uint32_t ih, unsigned cb, const char *msg) "ih=0x%x [%u] \"%s\"" +vof_avail(uint64_t start, uint64_t end, uint64_t size) "0x%"PRIx64"..0x%"PRIx64" size=0x%"PRIx64 +vof_claimed(uint64_t start, uint64_t end, uint64_t size) "0x%"PRIx64"..0x%"PRIx64" size=0x%"PRIx64 + # ppc.c ppc_tb_adjust(uint64_t 
offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)" diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c new file mode 100644 index 0000000..81f6596 --- /dev/null +++ b/hw/ppc/vof.c @@ -0,0 +1,1053 @@ +/* + * QEMU PowerPC Virtual Open Firmware. + * + * This implements client interface from OpenFirmware IEEE1275 on the QEMU + * side to leave only a very basic firmware in the VM. + * + * Copyright (c) 2021 IBM Corporation. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/timer.h" +#include "qemu/range.h" +#include "qemu/units.h" +#include "qemu/log.h" +#include "qapi/error.h" +#include "exec/ram_addr.h" +#include "exec/address-spaces.h" +#include "hw/ppc/vof.h" +#include "hw/ppc/fdt.h" +#include "sysemu/runstate.h" +#include "qom/qom-qobject.h" +#include "trace.h" + +#include <libfdt.h> + +/* + * OF 1275 "nextprop" description suggests it is 32 bytes max but + * LoPAPR defines "ibm,query-interrupt-source-number" which is 33 chars long. 
+ */ +#define OF_PROPNAME_LEN_MAX 64 + +#define VOF_MAX_PATH 256 +#define VOF_MAX_SETPROPLEN 2048 +#define VOF_MAX_METHODLEN 256 +#define VOF_MAX_FORTHCODE 256 +#define VOF_VTY_BUF_SIZE 256 + +typedef struct { + uint64_t start; + uint64_t size; +} OfClaimed; + +typedef struct { + char *path; /* the path used to open the instance */ + uint32_t phandle; +} OfInstance; + +static int readstr(hwaddr pa, char *buf, int size) +{ + if (VOF_MEM_READ(pa, buf, size) != MEMTX_OK) { + return -1; + } + if (strnlen(buf, size) == size) { + buf[size - 1] = '\0'; + trace_vof_error_str_truncated(buf, size); + return -1; + } + return 0; +} + +static bool cmpservice(const char *s, unsigned nargs, unsigned nret, + const char *s1, unsigned nargscheck, unsigned nretcheck) +{ + if (strcmp(s, s1)) { + return false; + } + if ((nargscheck && (nargs != nargscheck)) || + (nretcheck && (nret != nretcheck))) { + trace_vof_error_param(s, nargscheck, nretcheck, nargs, nret); + return false; + } + + return true; +} + +static void prop_format(char *tval, int tlen, const void *prop, int len) +{ + int i; + const unsigned char *c; + char *t; + const char bin[] = "..."; + + for (i = 0, c = prop; i < len; ++i, ++c) { + if (*c == '\0' && i == len - 1) { + strncpy(tval, prop, tlen - 1); + return; + } + if (*c < 0x20 || *c >= 0x80) { + break; + } + } + + for (i = 0, c = prop, t = tval; i < len; ++i, ++c) { + if (t >= tval + tlen - sizeof(bin) - 1 - 2 - 1) { + strcpy(t, bin); + return; + } + if (i && i % 4 == 0 && i != len - 1) { + strcat(t, " "); + ++t; + } + t += sprintf(t, "%02X", *c & 0xFF); + } +} + +static int get_path(const void *fdt, int offset, char *buf, int len) +{ + int ret; + + ret = fdt_get_path(fdt, offset, buf, len - 1); + if (ret < 0) { + return ret; + } + + buf[len - 1] = '\0'; + + return strlen(buf) + 1; +} + +static int phandle_to_path(const void *fdt, uint32_t ph, char *buf, int len) +{ + int ret; + + ret = fdt_node_offset_by_phandle(fdt, ph); + if (ret < 0) { + return ret; + } + + return 
get_path(fdt, ret, buf, len); +} + +static int path_offset(const void *fdt, const char *path) +{ + g_autofree char *p = NULL; + char *at; + + /* + * https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html#HDR16 + * + * "Conversion from numeric representation to text representation shall use + * the lower case forms of the hexadecimal digits in the range a..f, + * suppressing leading zeros". + */ + p = g_strdup(path); + for (at = strchr(p, '@'); at && *at; ) { + if (*at == '/') { + at = strchr(at, '@'); + } else { + *at = tolower(*at); + ++at; + } + } + + return fdt_path_offset(fdt, p); +} + +static uint32_t vof_finddevice(const void *fdt, uint32_t nodeaddr) +{ + char fullnode[VOF_MAX_PATH]; + uint32_t ret = -1; + int offset; + + if (readstr(nodeaddr, fullnode, sizeof(fullnode))) { + return (uint32_t) ret; + } + + offset = path_offset(fdt, fullnode); + if (offset >= 0) { + ret = fdt_get_phandle(fdt, offset); + } + trace_vof_finddevice(fullnode, ret); + return (uint32_t) ret; +} + +static const void *getprop(const void *fdt, int nodeoff, const char *propname, + int *proplen, bool *write0) +{ + const char *unit, *prop; + const void *ret = fdt_getprop(fdt, nodeoff, propname, proplen); + + if (ret) { + if (write0) { + *write0 = false; + } + return ret; + } + + if (strcmp(propname, "name")) { + return NULL; + } + /* + * We return a value for "name" from path if queried but property does not + * exist. @proplen does not include the unit part in this case. + */ + prop = fdt_get_name(fdt, nodeoff, proplen); + if (!prop) { + *proplen = 0; + return NULL; + } + + unit = memchr(prop, '@', *proplen); + if (unit) { + *proplen = unit - prop; + } + *proplen += 1; + + /* + * Since it might be cut at "@" and there will be no trailing zero + * in the prop buffer, tell the caller to write zero at the end. 
+ */ + if (write0) { + *write0 = true; + } + return prop; +} + +static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, uint32_t pname, + uint32_t valaddr, uint32_t vallen) +{ + char propname[OF_PROPNAME_LEN_MAX + 1]; + uint32_t ret = 0; + int proplen = 0; + const void *prop; + char trval[64] = ""; + int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph); + bool write0; + + if (nodeoff < 0) { + return -1; + } + if (readstr(pname, propname, sizeof(propname))) { + return -1; + } + prop = getprop(fdt, nodeoff, propname, &proplen, &write0); + if (prop) { + const char zero = 0; + int cb = MIN(proplen, vallen); + + if (VOF_MEM_WRITE(valaddr, prop, cb) != MEMTX_OK || + /* if that was "name" with a unit address, overwrite '@' with '0' */ + (write0 && + cb == proplen && + VOF_MEM_WRITE(valaddr + cb - 1, &zero, 1) != MEMTX_OK)) { + ret = -1; + } else { + /* + * OF1275 says: + * "Size is either the actual size of the property, or -1 if name + * does not exist", hence returning proplen instead of cb. 
+ */ + ret = proplen; + /* Do not format a value if tracepoint is silent, for performance */ + if (trace_event_get_state(TRACE_VOF_GETPROP) && + qemu_loglevel_mask(LOG_TRACE)) { + prop_format(trval, sizeof(trval), prop, ret); + } + } + } else { + ret = -1; + } + trace_vof_getprop(nodeph, propname, ret, trval); + + return ret; +} + +static uint32_t vof_getproplen(const void *fdt, uint32_t nodeph, uint32_t pname) +{ + char propname[OF_PROPNAME_LEN_MAX + 1]; + uint32_t ret = 0; + int proplen = 0; + const void *prop; + int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph); + + if (nodeoff < 0) { + return -1; + } + if (readstr(pname, propname, sizeof(propname))) { + return -1; + } + prop = getprop(fdt, nodeoff, propname, &proplen, NULL); + if (prop) { + ret = proplen; + } else { + ret = -1; + } + trace_vof_getproplen(nodeph, propname, ret); + + return ret; +} + +static uint32_t vof_setprop(MachineState *ms, void *fdt, Vof *vof, + uint32_t nodeph, uint32_t pname, + uint32_t valaddr, uint32_t vallen) +{ + char propname[OF_PROPNAME_LEN_MAX + 1]; + uint32_t ret = -1; + int offset; + char trval[64] = ""; + char nodepath[VOF_MAX_PATH] = ""; + Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF); + VofMachineIfClass *vmc; + g_autofree char *val = NULL; + + if (vallen > VOF_MAX_SETPROPLEN) { + goto trace_exit; + } + if (readstr(pname, propname, sizeof(propname))) { + goto trace_exit; + } + offset = fdt_node_offset_by_phandle(fdt, nodeph); + if (offset < 0) { + goto trace_exit; + } + ret = get_path(fdt, offset, nodepath, sizeof(nodepath)); + if (ret <= 0) { + goto trace_exit; + } + + val = g_malloc0(vallen); + if (VOF_MEM_READ(valaddr, val, vallen) != MEMTX_OK) { + goto trace_exit; + } + + if (!vmo) { + goto trace_exit; + } + + vmc = VOF_MACHINE_GET_CLASS(vmo); + if (!vmc->setprop || !vmc->setprop(ms, nodepath, propname, val, vallen)) { + goto trace_exit; + } + + ret = fdt_setprop(fdt, offset, propname, val, vallen); + if (ret) { + goto trace_exit; + } + + if 
(trace_event_get_state(TRACE_VOF_SETPROP) && + qemu_loglevel_mask(LOG_TRACE)) { + prop_format(trval, sizeof(trval), val, vallen); + } + ret = vallen; + +trace_exit: + trace_vof_setprop(nodeph, propname, trval, vallen, ret); + + return ret; +} + +static uint32_t vof_nextprop(const void *fdt, uint32_t phandle, + uint32_t prevaddr, uint32_t nameaddr) +{ + int offset, nodeoff = fdt_node_offset_by_phandle(fdt, phandle); + char prev[OF_PROPNAME_LEN_MAX + 1]; + const char *tmp; + + if (readstr(prevaddr, prev, sizeof(prev))) { + return -1; + } + + fdt_for_each_property_offset(offset, fdt, nodeoff) { + if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) { + return 0; + } + if (prev[0] == '\0' || strcmp(prev, tmp) == 0) { + if (prev[0] != '\0') { + offset = fdt_next_property_offset(fdt, offset); + if (offset < 0) { + return 0; + } + } + if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) { + return 0; + } + + if (VOF_MEM_WRITE(nameaddr, tmp, strlen(tmp) + 1) != MEMTX_OK) { + return -1; + } + return 1; + } + } + + return 0; +} + +static uint32_t vof_peer(const void *fdt, uint32_t phandle) +{ + int ret; + + if (phandle == 0) { + ret = fdt_path_offset(fdt, "/"); + } else { + ret = fdt_next_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle)); + } + + if (ret < 0) { + ret = 0; + } else { + ret = fdt_get_phandle(fdt, ret); + } + + return ret; +} + +static uint32_t vof_child(const void *fdt, uint32_t phandle) +{ + int ret = fdt_first_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle)); + + if (ret < 0) { + ret = 0; + } else { + ret = fdt_get_phandle(fdt, ret); + } + + return ret; +} + +static uint32_t vof_parent(const void *fdt, uint32_t phandle) +{ + int ret = fdt_parent_offset(fdt, fdt_node_offset_by_phandle(fdt, phandle)); + + if (ret < 0) { + ret = 0; + } else { + ret = fdt_get_phandle(fdt, ret); + } + + return ret; +} + +static uint32_t vof_do_open(void *fdt, Vof *vof, int offset, const char *path) +{ + uint32_t ret = -1; + OfInstance *inst = NULL; + + if 
(vof->of_instance_last == 0xFFFFFFFF) { + /* We do not recycle ihandles yet */ + goto trace_exit; + } + + inst = g_new0(OfInstance, 1); + inst->phandle = fdt_get_phandle(fdt, offset); + g_assert(inst->phandle); + ++vof->of_instance_last; + + inst->path = g_strdup(path); + g_hash_table_insert(vof->of_instances, + GINT_TO_POINTER(vof->of_instance_last), + inst); + ret = vof->of_instance_last; + +trace_exit: + trace_vof_open(path, inst ? inst->phandle : 0, ret); + + return ret; +} + +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename, + const char *prop, const char *path) +{ + int node = fdt_path_offset(fdt, nodename); + int inst, offset; + + offset = fdt_path_offset(fdt, path); + if (offset < 0) { + trace_vof_error_unknown_path(path); + return offset; + } + + inst = vof_do_open(fdt, vof, offset, path); + + return fdt_setprop_cell(fdt, node, prop, inst); +} + +static uint32_t vof_open(void *fdt, Vof *vof, uint32_t pathaddr) +{ + char path[VOF_MAX_PATH]; + int offset; + + if (readstr(pathaddr, path, sizeof(path))) { + return -1; + } + + offset = path_offset(fdt, path); + if (offset < 0) { + trace_vof_error_unknown_path(path); + return offset; + } + + return vof_do_open(fdt, vof, offset, path); +} + +static void vof_close(Vof *vof, uint32_t ihandle) +{ + if (!g_hash_table_remove(vof->of_instances, GINT_TO_POINTER(ihandle))) { + trace_vof_error_unknown_ihandle_close(ihandle); + } +} + +static uint32_t vof_instance_to_package(Vof *vof, uint32_t ihandle) +{ + gpointer instp = g_hash_table_lookup(vof->of_instances, + GINT_TO_POINTER(ihandle)); + uint32_t ret = -1; + + if (instp) { + ret = ((OfInstance *)instp)->phandle; + } + trace_vof_instance_to_package(ihandle, ret); + + return ret; +} + +static uint32_t vof_package_to_path(const void *fdt, uint32_t phandle, + uint32_t buf, uint32_t len) +{ + uint32_t ret = -1; + char tmp[VOF_MAX_PATH] = ""; + + ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp)); + if (ret > 0) { + if (VOF_MEM_WRITE(buf, tmp, 
ret) != MEMTX_OK) { + ret = -1; + } + } + + trace_vof_package_to_path(phandle, tmp, ret); + + return ret; +} + +static uint32_t vof_instance_to_path(void *fdt, Vof *vof, uint32_t ihandle, + uint32_t buf, uint32_t len) +{ + uint32_t ret = -1; + uint32_t phandle = vof_instance_to_package(vof, ihandle); + char tmp[VOF_MAX_PATH] = ""; + + if (phandle != -1) { + ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp)); + if (ret > 0) { + if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) { + ret = -1; + } + } + } + trace_vof_instance_to_path(ihandle, phandle, tmp, ret); + + return ret; +} + +static uint32_t vof_write(Vof *vof, uint32_t ihandle, uint32_t buf, + uint32_t len) +{ + char tmp[VOF_VTY_BUF_SIZE]; + unsigned cb; + OfInstance *inst = (OfInstance *) + g_hash_table_lookup(vof->of_instances, GINT_TO_POINTER(ihandle)); + + if (!inst) { + trace_vof_error_write(ihandle); + return -1; + } + + for ( ; len > 0; len -= cb) { + cb = MIN(len, sizeof(tmp) - 1); + if (VOF_MEM_READ(buf, tmp, cb) != MEMTX_OK) { + return -1; + } + + /* FIXME: there is no backend(s) yet so just call a trace */ + if (trace_event_get_state(TRACE_VOF_WRITE) && + qemu_loglevel_mask(LOG_TRACE)) { + tmp[cb] = '\0'; + trace_vof_write(ihandle, cb, tmp); + } + } + + return len; +} + +static void vof_claimed_dump(GArray *claimed) +{ + int i; + OfClaimed c; + + if (trace_event_get_state(TRACE_VOF_CLAIMED) && + qemu_loglevel_mask(LOG_TRACE)) { + + for (i = 0; i < claimed->len; ++i) { + c = g_array_index(claimed, OfClaimed, i); + trace_vof_claimed(c.start, c.start + c.size, c.size); + } + } +} + +static bool vof_claim_avail(GArray *claimed, uint64_t virt, uint64_t size) +{ + int i; + OfClaimed c; + + for (i = 0; i < claimed->len; ++i) { + c = g_array_index(claimed, OfClaimed, i); + if (ranges_overlap(c.start, c.size, virt, size)) { + return false; + } + } + + return true; +} + +static void vof_claim_add(GArray *claimed, uint64_t virt, uint64_t size) +{ + OfClaimed newclaim; + + newclaim.start = virt; + newclaim.size 
= size; + g_array_append_val(claimed, newclaim); +} + +static gint of_claimed_compare_func(gconstpointer a, gconstpointer b) +{ + return ((OfClaimed *)a)->start - ((OfClaimed *)b)->start; +} + +static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t base) +{ + int i, n, offset, proplen = 0, sc, ac; + target_ulong mem0_end; + const uint8_t *mem0_reg; + g_autofree uint8_t *avail = NULL; + uint8_t *availcur; + + if (!fdt || !claimed) { + return; + } + + offset = fdt_path_offset(fdt, "/"); + _FDT(offset); + ac = fdt_address_cells(fdt, offset); + g_assert(ac == 1 || ac == 2); + sc = fdt_size_cells(fdt, offset); + g_assert(sc == 1 || sc == 2); + + offset = fdt_path_offset(fdt, "/memory@0"); + _FDT(offset); + + mem0_reg = fdt_getprop(fdt, offset, "reg", &proplen); + g_assert(mem0_reg && proplen == sizeof(uint32_t) * (ac + sc)); + if (sc == 2) { + mem0_end = be64_to_cpu(*(uint64_t *)(mem0_reg + sizeof(uint32_t) * ac)); + } else { + mem0_end = be32_to_cpu(*(uint32_t *)(mem0_reg + sizeof(uint32_t) * ac)); + } + + g_array_sort(claimed, of_claimed_compare_func); + vof_claimed_dump(claimed); + + /* + * VOF resides in the first page so we do not need to check if there is + * available memory before the first claimed block + */ + g_assert(claimed->len && (g_array_index(claimed, OfClaimed, 0).start == 0)); + + avail = g_malloc0(sizeof(uint32_t) * (ac + sc) * claimed->len); + for (i = 0, n = 0, availcur = avail; i < claimed->len; ++i) { + OfClaimed c = g_array_index(claimed, OfClaimed, i); + uint64_t start, size; + + start = c.start + c.size; + if (i < claimed->len - 1) { + OfClaimed cn = g_array_index(claimed, OfClaimed, i + 1); + + size = cn.start - start; + } else { + size = mem0_end - start; + } + + if (ac == 2) { + *(uint64_t *) availcur = cpu_to_be64(start); + } else { + *(uint32_t *) availcur = cpu_to_be32(start); + } + availcur += sizeof(uint32_t) * ac; + if (sc == 2) { + *(uint64_t *) availcur = cpu_to_be64(size); + } else { + *(uint32_t *) availcur = 
cpu_to_be32(size); + } + availcur += sizeof(uint32_t) * sc; + + if (size) { + trace_vof_avail(c.start + c.size, c.start + c.size + size, size); + ++n; + } + } + _FDT((fdt_setprop(fdt, offset, "available", avail, availcur - avail))); +} + +/* + * OF1275: + * "Allocates size bytes of memory. If align is zero, the allocated range + * begins at the virtual address virt. Otherwise, an aligned address is + * automatically chosen and the input argument virt is ignored". + * + * In other words, exactly one of @virt and @align is non-zero. + */ +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, + uint64_t align) +{ + uint64_t ret; + + if (size == 0) { + ret = -1; + } else if (align == 0) { + if (!vof_claim_avail(vof->claimed, virt, size)) { + ret = -1; + } else { + ret = virt; + } + } else { + vof->claimed_base = QEMU_ALIGN_UP(vof->claimed_base, align); + while (1) { + if (vof->claimed_base >= vof->top_addr) { + error_report("Out of RMA memory for the OF client"); + return -1; + } + if (vof_claim_avail(vof->claimed, vof->claimed_base, size)) { + break; + } + vof->claimed_base += size; + } + ret = vof->claimed_base; + } + + if (ret != -1) { + vof->claimed_base = MAX(vof->claimed_base, ret + size); + vof_claim_add(vof->claimed, ret, size); + } + trace_vof_claim(virt, size, align, ret); + + return ret; +} + +static uint32_t vof_release(Vof *vof, uint64_t virt, uint64_t size) +{ + uint32_t ret = -1; + int i; + GArray *claimed = vof->claimed; + OfClaimed c; + + for (i = 0; i < claimed->len; ++i) { + c = g_array_index(claimed, OfClaimed, i); + if (c.start == virt && c.size == size) { + g_array_remove_index(claimed, i); + ret = 0; + break; + } + } + + trace_vof_release(virt, size, ret); + + return ret; +} + +static void vof_instantiate_rtas(Error **errp) +{ + error_setg(errp, "The firmware should have instantiated RTAS"); +} + +static uint32_t vof_call_method(MachineState *ms, Vof *vof, uint32_t methodaddr, + uint32_t ihandle, uint32_t param1, + uint32_t param2, uint32_t 
param3, + uint32_t param4, uint32_t *ret2) +{ + uint32_t ret = -1; + char method[VOF_MAX_METHODLEN] = ""; + OfInstance *inst; + + if (!ihandle) { + goto trace_exit; + } + + inst = (OfInstance *)g_hash_table_lookup(vof->of_instances, + GINT_TO_POINTER(ihandle)); + if (!inst) { + goto trace_exit; + } + + if (readstr(methodaddr, method, sizeof(method))) { + goto trace_exit; + } + + if (strcmp(inst->path, "/") == 0) { + if (strcmp(method, "ibm,client-architecture-support") == 0) { + Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF); + + if (vmo) { + VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo); + + g_assert(vmc->client_architecture_support); + ret = vmc->client_architecture_support(ms, first_cpu, param1); + } + + *ret2 = 0; + } + } else if (strcmp(inst->path, "/rtas") == 0) { + if (strcmp(method, "instantiate-rtas") == 0) { + vof_instantiate_rtas(&error_fatal); + ret = 0; + *ret2 = param1; /* rtas-base */ + } + } else { + trace_vof_error_unknown_method(method); + } + +trace_exit: + trace_vof_method(ihandle, method, param1, ret, *ret2); + + return ret; +} + +static uint32_t vof_call_interpret(uint32_t cmdaddr, uint32_t param1, + uint32_t param2, uint32_t *ret2) +{ + uint32_t ret = -1; + char cmd[VOF_MAX_FORTHCODE] = ""; + + /* No interpret implemented so just call a trace */ + readstr(cmdaddr, cmd, sizeof(cmd)); + trace_vof_interpret(cmd, param1, param2, ret, *ret2); + + return ret; +} + +static void vof_quiesce(MachineState *ms, void *fdt, Vof *vof) +{ + Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF); + /* After "quiesce", no change is expected to the FDT, pack FDT to ensure */ + int rc = fdt_pack(fdt); + + assert(rc == 0); + + if (vmo) { + VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo); + + if (vmc->quiesce) { + vmc->quiesce(ms); + } + } + + vof_claimed_dump(vof->claimed); +} + +static uint32_t vof_client_handle(MachineState *ms, void *fdt, Vof *vof, + const char *service, + uint32_t *args, unsigned nargs, + uint32_t 
*rets, unsigned nrets) +{ + uint32_t ret = 0; + + /* @nrets includes the value which this function returns */ +#define cmpserv(s, a, r) \ + cmpservice(service, nargs, nrets, (s), (a), (r)) + + if (cmpserv("finddevice", 1, 1)) { + ret = vof_finddevice(fdt, args[0]); + } else if (cmpserv("getprop", 4, 1)) { + ret = vof_getprop(fdt, args[0], args[1], args[2], args[3]); + } else if (cmpserv("getproplen", 2, 1)) { + ret = vof_getproplen(fdt, args[0], args[1]); + } else if (cmpserv("setprop", 4, 1)) { + ret = vof_setprop(ms, fdt, vof, args[0], args[1], args[2], args[3]); + } else if (cmpserv("nextprop", 3, 1)) { + ret = vof_nextprop(fdt, args[0], args[1], args[2]); + } else if (cmpserv("peer", 1, 1)) { + ret = vof_peer(fdt, args[0]); + } else if (cmpserv("child", 1, 1)) { + ret = vof_child(fdt, args[0]); + } else if (cmpserv("parent", 1, 1)) { + ret = vof_parent(fdt, args[0]); + } else if (cmpserv("open", 1, 1)) { + ret = vof_open(fdt, vof, args[0]); + } else if (cmpserv("close", 1, 0)) { + vof_close(vof, args[0]); + } else if (cmpserv("instance-to-package", 1, 1)) { + ret = vof_instance_to_package(vof, args[0]); + } else if (cmpserv("package-to-path", 3, 1)) { + ret = vof_package_to_path(fdt, args[0], args[1], args[2]); + } else if (cmpserv("instance-to-path", 3, 1)) { + ret = vof_instance_to_path(fdt, vof, args[0], args[1], args[2]); + } else if (cmpserv("write", 3, 1)) { + ret = vof_write(vof, args[0], args[1], args[2]); + } else if (cmpserv("claim", 3, 1)) { + ret = vof_claim(vof, args[0], args[1], args[2]); + if (ret != -1) { + vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base); + } + } else if (cmpserv("release", 2, 0)) { + ret = vof_release(vof, args[0], args[1]); + if (ret != -1) { + vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base); + } + } else if (cmpserv("call-method", 0, 0)) { + ret = vof_call_method(ms, vof, args[0], args[1], args[2], args[3], + args[4], args[5], rets); + } else if (cmpserv("interpret", 0, 0)) { + ret = 
vof_call_interpret(args[0], args[1], args[2], rets); + } else if (cmpserv("milliseconds", 0, 1)) { + ret = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); + } else if (cmpserv("quiesce", 0, 0)) { + vof_quiesce(ms, fdt, vof); + } else if (cmpserv("exit", 0, 0)) { + error_report("Stopped as the VM requested \"exit\""); + vm_stop(RUN_STATE_PAUSED); + } else { + trace_vof_error_unknown_service(service, nargs, nrets); + ret = -1; + } + +#undef cmpserv + + return ret; +} + +/* Defined as Big Endian */ +struct prom_args { + uint32_t service; + uint32_t nargs; + uint32_t nret; + uint32_t args[10]; +} QEMU_PACKED; + +int vof_client_call(MachineState *ms, Vof *vof, void *fdt, + target_ulong args_real) +{ + struct prom_args args_be; + uint32_t args[ARRAY_SIZE(args_be.args)]; + uint32_t rets[ARRAY_SIZE(args_be.args)] = { 0 }, ret; + char service[64]; + unsigned nargs, nret, i; + + if (VOF_MEM_READ(args_real, &args_be, sizeof(args_be)) != MEMTX_OK) { + return -EINVAL; + } + nargs = be32_to_cpu(args_be.nargs); + if (nargs >= ARRAY_SIZE(args_be.args)) { + return -EINVAL; + } + + if (VOF_MEM_READ(be32_to_cpu(args_be.service), service, sizeof(service)) != + MEMTX_OK) { + return -EINVAL; + } + if (strnlen(service, sizeof(service)) == sizeof(service)) { + /* Too long service name */ + return -EINVAL; + } + + for (i = 0; i < nargs; ++i) { + args[i] = be32_to_cpu(args_be.args[i]); + } + + nret = be32_to_cpu(args_be.nret); + ret = vof_client_handle(ms, fdt, vof, service, args, nargs, rets, nret); + if (!nret) { + return 0; + } + + args_be.args[nargs] = cpu_to_be32(ret); + for (i = 1; i < nret; ++i) { + args_be.args[nargs + i] = cpu_to_be32(rets[i - 1]); + } + + if (VOF_MEM_WRITE(args_real + offsetof(struct prom_args, args[nargs]), + args_be.args + nargs, sizeof(args_be.args[0]) * nret) != + MEMTX_OK) { + return -EINVAL; + } + + return 0; +} + +static void vof_instance_free(gpointer data) +{ + OfInstance *inst = (OfInstance *)data; + + g_free(inst->path); + g_free(inst); +} + +void vof_init(Vof 
*vof, uint64_t top_addr, Error **errp) +{ + vof_cleanup(vof); + + vof->of_instances = g_hash_table_new_full(g_direct_hash, g_direct_equal, + NULL, vof_instance_free); + vof->claimed = g_array_new(false, false, sizeof(OfClaimed)); + + /* Keep allocations in 32bit as CLI ABI can only return cells==32bit */ + vof->top_addr = MIN(top_addr, 4 * GiB); + if (vof_claim(vof, 0, vof->fw_size, 0) == -1) { + error_setg(errp, "Memory for firmware is in use"); + } +} + +void vof_cleanup(Vof *vof) +{ + if (vof->claimed) { + g_array_unref(vof->claimed); + } + if (vof->of_instances) { + g_hash_table_unref(vof->of_instances); + } + vof->claimed = NULL; + vof->of_instances = NULL; +} + +void vof_build_dt(void *fdt, Vof *vof) +{ + uint32_t phandle = fdt_get_max_phandle(fdt); + int offset, proplen = 0; + const void *prop; + + /* Assign phandles to nodes without predefined phandles (like XICS/XIVE) */ + for (offset = fdt_next_node(fdt, -1, NULL); + offset >= 0; + offset = fdt_next_node(fdt, offset, NULL)) { + prop = fdt_getprop(fdt, offset, "phandle", &proplen); + if (prop) { + continue; + } + ++phandle; + _FDT(fdt_setprop_cell(fdt, offset, "phandle", phandle)); + } + + vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base); +} + +static const TypeInfo vof_machine_if_info = { + .name = TYPE_VOF_MACHINE_IF, + .parent = TYPE_INTERFACE, + .class_size = sizeof(VofMachineIfClass), +}; + +static void vof_machine_if_register_types(void) +{ + type_register_static(&vof_machine_if_info); +} +type_init(vof_machine_if_register_types) diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index d68888f..6a2df1c 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -31,6 +31,7 @@ #include "trace.h" #include "hw/s390x/css-bridge.h" #include "hw/s390x/s390-virtio-ccw.h" +#include "sysemu/replay.h" #define NR_CLASSIC_INDICATOR_BITS 64 @@ -770,6 +771,11 @@ static void virtio_ccw_device_realize(VirtioCcwDevice *dev, Error **errp) dev->flags &= ~VIRTIO_CCW_FLAG_USE_IOEVENTFD; } + /* 
fd-based ioevents can't be synchronized in record/replay */ + if (replay_mode != REPLAY_MODE_NONE) { + dev->flags &= ~VIRTIO_CCW_FLAG_USE_IOEVENTFD; + } + if (k->realize) { k->realize(dev, &err); if (err) { diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c index 28e0032..18eb824 100644 --- a/hw/scsi/virtio-scsi-dataplane.c +++ b/hw/scsi/virtio-scsi-dataplane.c @@ -152,6 +152,10 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev) goto fail_guest_notifiers; } + /* + * Batch all the host notifiers in a single transaction to avoid + * quadratic time complexity in address_space_update_ioeventfds(). + */ memory_region_transaction_begin(); rc = virtio_scsi_set_host_notifier(s, vs->ctrl_vq, 0); @@ -198,6 +202,10 @@ fail_host_notifiers: virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false); } + /* + * The transaction expects the ioeventfds to be open when it + * commits. Do it now, before the cleanup loop. + */ memory_region_transaction_commit(); for (i = 0; i < vq_init_count; i++) { @@ -238,12 +246,20 @@ void virtio_scsi_dataplane_stop(VirtIODevice *vdev) blk_drain_all(); /* ensure there are no in-flight requests */ + /* + * Batch all the host notifiers in a single transaction to avoid + * quadratic time complexity in address_space_update_ioeventfds(). + */ memory_region_transaction_begin(); for (i = 0; i < vs->conf.num_queues + 2; i++) { virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false); } + /* + * The transaction expects the ioeventfds to be open when it + * commits. Do it now, before the cleanup loop. 
+ */ memory_region_transaction_commit(); for (i = 0; i < vs->conf.num_queues + 2; i++) { diff --git a/hw/vfio/common.c b/hw/vfio/common.c index ae5654f..3f0d111 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -36,6 +36,7 @@ #include "qemu/range.h" #include "sysemu/kvm.h" #include "sysemu/reset.h" +#include "sysemu/runstate.h" #include "trace.h" #include "qapi/error.h" #include "migration/migration.h" @@ -134,6 +135,29 @@ static const char *index_to_str(VFIODevice *vbasedev, int index) } } +static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) +{ + switch (container->iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + /* + * We support coordinated discarding of RAM via the RamDiscardManager. + */ + return ram_block_uncoordinated_discard_disable(state); + default: + /* + * VFIO_SPAPR_TCE_IOMMU most probably works just fine with + * RamDiscardManager, however, it is completely untested. + * + * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does + * completely the opposite of managing mapping/pinning dynamically as + * required by RamDiscardManager. We would have to special-case sections + * with a RamDiscardManager. + */ + return ram_block_discard_disable(state); + } +} + int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, int action, int fd, Error **errp) { @@ -569,6 +593,44 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, error_report("iommu map to non memory area %"HWADDR_PRIx"", xlat); return false; + } else if (memory_region_has_ram_discard_manager(mr)) { + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr); + MemoryRegionSection tmp = { + .mr = mr, + .offset_within_region = xlat, + .size = int128_make64(len), + }; + + /* + * Malicious VMs can map memory into the IOMMU, which is expected + * to remain discarded. vfio will pin all pages, populating memory. + * Disallow that. 
vmstate priorities make sure any RamDiscardManager + * were already restored before IOMMUs are restored. + */ + if (!ram_discard_manager_is_populated(rdm, &tmp)) { + error_report("iommu map to discarded memory (e.g., unplugged via" + " virtio-mem): %"HWADDR_PRIx"", + iotlb->translated_addr); + return false; + } + + /* + * Malicious VMs might trigger discarding of IOMMU-mapped memory. The + * pages will remain pinned inside vfio until unmapped, resulting in a + * higher memory consumption than expected. If memory would get + * populated again later, there would be an inconsistency between pages + * pinned by vfio and pages seen by QEMU. This is the case until + * unmapped from the IOMMU (e.g., during device reset). + * + * With malicious guests, we really only care about pinning more memory + * than expected. RLIMIT_MEMLOCK set for the user/process can never be + * exceeded and can be used to mitigate this problem. + */ + warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" + " RAM (e.g., virtio-mem) works, however, malicious" + " guests can trigger pinning of more memory than" + " intended via an IOMMU. It's possible to mitigate " + " by setting/adjusting RLIMIT_MEMLOCK."); } /* @@ -649,6 +711,153 @@ out: rcu_read_unlock(); } +static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + int ret; + + /* Unmap with a single call. 
*/ + ret = vfio_dma_unmap(vrdl->container, iova, size , NULL); + if (ret) { + error_report("%s: vfio_dma_unmap() failed: %s", __func__, + strerror(-ret)); + } +} + +static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); + const hwaddr end = section->offset_within_region + + int128_get64(section->size); + hwaddr start, next, iova; + void *vaddr; + int ret; + + /* + * Map in (aligned within memory region) minimum granularity, so we can + * unmap in minimum granularity later. + */ + for (start = section->offset_within_region; start < end; start = next) { + next = ROUND_UP(start + 1, vrdl->granularity); + next = MIN(next, end); + + iova = start - section->offset_within_region + + section->offset_within_address_space; + vaddr = memory_region_get_ram_ptr(section->mr) + start; + + ret = vfio_dma_map(vrdl->container, iova, next - start, + vaddr, section->readonly); + if (ret) { + /* Rollback */ + vfio_ram_discard_notify_discard(rdl, section); + return ret; + } + } + return 0; +} + +static void vfio_register_ram_discard_listener(VFIOContainer *container, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl; + + /* Ignore some corner cases not relevant in practice. 
*/ + g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE)); + g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, + TARGET_PAGE_SIZE)); + g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); + + vrdl = g_new0(VFIORamDiscardListener, 1); + vrdl->container = container; + vrdl->mr = section->mr; + vrdl->offset_within_address_space = section->offset_within_address_space; + vrdl->size = int128_get64(section->size); + vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, + section->mr); + + g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); + g_assert(vrdl->granularity >= 1 << ctz64(container->pgsizes)); + + ram_discard_listener_init(&vrdl->listener, + vfio_ram_discard_notify_populate, + vfio_ram_discard_notify_discard, true); + ram_discard_manager_register_listener(rdm, &vrdl->listener, section); + QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next); + + /* + * Sanity-check if we have a theoretically problematic setup where we could + * exceed the maximum number of possible DMA mappings over time. We assume + * that each mapped section in the same address space as a RamDiscardManager + * section consumes exactly one DMA mapping, with the exception of + * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections + * in the same address space as RamDiscardManager sections. + * + * We assume that each section in the address space consumes one memslot. + * We take the number of KVM memory slots as a best guess for the maximum + * number of sections in the address space we could have over time, + * also consuming DMA mappings. 
+ */ + if (container->dma_max_mappings) { + unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; + +#ifdef CONFIG_KVM + if (kvm_enabled()) { + max_memslots = kvm_get_max_memslots(); + } +#endif + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + hwaddr start, end; + + start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, + vrdl->granularity); + end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, + vrdl->granularity); + vrdl_mappings += (end - start) / vrdl->granularity; + vrdl_count++; + } + + if (vrdl_mappings + max_memslots - vrdl_count > + container->dma_max_mappings) { + warn_report("%s: possibly running out of DMA mappings. E.g., try" + " increasing the 'block-size' of virtio-mem devies." + " Maximum possible DMA mappings: %d, Maximum possible" + " memslots: %d", __func__, container->dma_max_mappings, + max_memslots); + } + } +} + +static void vfio_unregister_ram_discard_listener(VFIOContainer *container, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to unregister missing RAM discard listener"); + } + + ram_discard_manager_unregister_listener(rdm, &vrdl->listener); + QLIST_REMOVE(vrdl, next); + g_free(vrdl); +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { @@ -810,6 +1019,16 @@ static void vfio_listener_region_add(MemoryListener *listener, /* Here we assume that memory_region_is_ram(section->mr)==true */ + /* + * For RAM memory regions with a RamDiscardManager, we only want to map the + * actually populated parts - and update the mapping whenever we're notified + * about changes. 
+ */ + if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_register_ram_discard_listener(container, section); + return; + } + vaddr = memory_region_get_ram_ptr(section->mr) + section->offset_within_region + (iova - section->offset_within_address_space); @@ -947,6 +1166,10 @@ static void vfio_listener_region_del(MemoryListener *listener, pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); + } else if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_unregister_ram_discard_listener(container, section); + /* Unregistering will trigger an unmap. */ + try_unmap = false; } if (try_unmap) { @@ -1108,6 +1331,49 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_unlock(); } +static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, + void *opaque) +{ + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + VFIORamDiscardListener *vrdl = opaque; + + /* + * Sync the whole mapped region (spanning multiple individual mappings) + * in one go. + */ + return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr); +} + +static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + } + + /* + * We only want/can synchronize the bitmap for actually mapped parts - + * which correspond to populated parts. Replay all populated parts. 
+ */ + return ram_discard_manager_replay_populated(rdm, section, + vfio_ram_discard_get_dirty_bitmap, + &vrdl); +} + static int vfio_sync_dirty_bitmap(VFIOContainer *container, MemoryRegionSection *section) { @@ -1139,6 +1405,8 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, } } return 0; + } else if (memory_region_has_ram_discard_manager(section->mr)) { + return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); } ram_addr = memory_region_get_ram_addr(section->mr) + @@ -1732,15 +2000,25 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * new memory, it will not yet set ram_block_discard_set_required() and * therefore, neither stops us here or deals with the sudden memory * consumption of inflated memory. + * + * We do support discarding of memory coordinated via the RamDiscardManager + * with some IOMMU types. vfio_ram_block_discard_disable() handles the + * details once we know which type of IOMMU we are using. */ - ret = ram_block_discard_disable(true); - if (ret) { - error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); - return ret; - } QLIST_FOREACH(container, &space->containers, next) { if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, + "Cannot set discarding of RAM broken"); + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, + &container->fd)) { + error_report("vfio: error disconnecting group %d from" + " container", group->groupid); + } + return ret; + } group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); vfio_kvm_device_add_group(group); @@ -1768,14 +2046,22 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->fd = fd; container->error = NULL; container->dirty_pages_supported = false; + container->dma_max_mappings = 0; QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->hostwin_list); + 
QLIST_INIT(&container->vrdl_list); ret = vfio_init_container(container, group->fd, errp); if (ret) { goto free_container_exit; } + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); + goto free_container_exit; + } + switch (container->iommu_type) { case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: @@ -1798,7 +2084,10 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes); container->pgsizes = info->iova_pgsizes; + /* The default in the kernel ("dma_entry_limit") is 65535. */ + container->dma_max_mappings = 65535; if (!ret) { + vfio_get_info_dma_avail(info, &container->dma_max_mappings); vfio_get_iommu_info_migration(container, info); } g_free(info); @@ -1820,7 +2109,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (ret) { error_setg_errno(errp, errno, "failed to enable container"); ret = -errno; - goto free_container_exit; + goto enable_discards_exit; } } else { container->prereg_listener = vfio_prereg_listener; @@ -1832,7 +2121,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, ret = -1; error_propagate_prepend(errp, container->error, "RAM memory listener initialization failed: "); - goto free_container_exit; + goto enable_discards_exit; } } @@ -1845,7 +2134,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (v2) { memory_listener_unregister(&container->prereg_listener); } - goto free_container_exit; + goto enable_discards_exit; } if (v2) { @@ -1860,7 +2149,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (ret) { error_setg_errno(errp, -ret, "failed to remove existing window"); - goto free_container_exit; + goto enable_discards_exit; } } else { /* The default table uses 4K pages */ @@ -1901,6 +2190,9 @@ listener_release_exit: vfio_kvm_device_del_group(group); vfio_listener_release(container); 
+enable_discards_exit: + vfio_ram_block_discard_disable(container, false); + free_container_exit: g_free(container); @@ -1908,7 +2200,6 @@ close_fd_exit: close(fd); put_space_exit: - ram_block_discard_disable(false); vfio_put_address_space(space); return ret; @@ -2030,7 +2321,7 @@ void vfio_put_group(VFIOGroup *group) } if (!group->ram_block_discard_allowed) { - ram_block_discard_disable(false); + vfio_ram_block_discard_disable(group->container, false); } vfio_kvm_device_del_group(group); vfio_disconnect_container(group); @@ -2084,7 +2375,7 @@ int vfio_get_device(VFIOGroup *group, const char *name, if (!group->ram_block_discard_allowed) { group->ram_block_discard_allowed = true; - ram_block_discard_disable(false); + vfio_ram_block_discard_disable(group->container, false); } } diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index 75aa7d6..df91e45 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -145,7 +145,173 @@ static bool virtio_mem_is_busy(void) return migration_in_incoming_postcopy() || !migration_is_idle(); } -static bool virtio_mem_test_bitmap(VirtIOMEM *vmem, uint64_t start_gpa, +typedef int (*virtio_mem_range_cb)(const VirtIOMEM *vmem, void *arg, + uint64_t offset, uint64_t size); + +static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg, + virtio_mem_range_cb cb) +{ + unsigned long first_zero_bit, last_zero_bit; + uint64_t offset, size; + int ret = 0; + + first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size); + while (first_zero_bit < vmem->bitmap_size) { + offset = first_zero_bit * vmem->block_size; + last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, + first_zero_bit + 1) - 1; + size = (last_zero_bit - first_zero_bit + 1) * vmem->block_size; + + ret = cb(vmem, arg, offset, size); + if (ret) { + break; + } + first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, + last_zero_bit + 2); + } + return ret; +} + +/* + * Adjust the memory section to cover the 
intersection with the given range. + * + * Returns false if the intersection is empty, otherwise returns true. + */ +static bool virito_mem_intersect_memory_section(MemoryRegionSection *s, + uint64_t offset, uint64_t size) +{ + uint64_t start = MAX(s->offset_within_region, offset); + uint64_t end = MIN(s->offset_within_region + int128_get64(s->size), + offset + size); + + if (end <= start) { + return false; + } + + s->offset_within_address_space += start - s->offset_within_region; + s->offset_within_region = start; + s->size = int128_make64(end - start); + return true; +} + +typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg); + +static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem, + MemoryRegionSection *s, + void *arg, + virtio_mem_section_cb cb) +{ + unsigned long first_bit, last_bit; + uint64_t offset, size; + int ret = 0; + + first_bit = s->offset_within_region / vmem->block_size; + first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit); + while (first_bit < vmem->bitmap_size) { + MemoryRegionSection tmp = *s; + + offset = first_bit * vmem->block_size; + last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, + first_bit + 1) - 1; + size = (last_bit - first_bit + 1) * vmem->block_size; + + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + break; + } + ret = cb(&tmp, arg); + if (ret) { + break; + } + first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, + last_bit + 2); + } + return ret; +} + +static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg) +{ + RamDiscardListener *rdl = arg; + + return rdl->notify_populate(rdl, s); +} + +static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg) +{ + RamDiscardListener *rdl = arg; + + rdl->notify_discard(rdl, s); + return 0; +} + +static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset, + uint64_t size) +{ + RamDiscardListener *rdl; + + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + 
MemoryRegionSection tmp = *rdl->section; + + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + continue; + } + rdl->notify_discard(rdl, &tmp); + } +} + +static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, + uint64_t size) +{ + RamDiscardListener *rdl, *rdl2; + int ret = 0; + + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + MemoryRegionSection tmp = *rdl->section; + + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + continue; + } + ret = rdl->notify_populate(rdl, &tmp); + if (ret) { + break; + } + } + + if (ret) { + /* Roll back: notify all already-notified listeners about the discard. */ + QLIST_FOREACH(rdl2, &vmem->rdl_list, next) { + MemoryRegionSection tmp = *rdl2->section; + + if (rdl2 == rdl) { + break; + } + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + continue; + } + rdl2->notify_discard(rdl2, &tmp); + } + } + return ret; +} + +static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem) +{ + RamDiscardListener *rdl; + + if (!vmem->size) { + return; + } + + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + if (rdl->double_discard_supported) { + rdl->notify_discard(rdl, rdl->section); + } else { + virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_discard_cb); + } + } +} + +static bool virtio_mem_test_bitmap(const VirtIOMEM *vmem, uint64_t start_gpa, uint64_t size, bool plugged) { const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size; @@ -198,7 +364,8 @@ static void virtio_mem_send_response_simple(VirtIOMEM *vmem, virtio_mem_send_response(vmem, elem, &resp); } -static bool virtio_mem_valid_range(VirtIOMEM *vmem, uint64_t gpa, uint64_t size) +static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa, + uint64_t size) { if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) { return false; @@ -219,19 +386,21 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa, uint64_t size, bool plug) { const uint64_t offset = start_gpa - vmem->addr; - int ret; + 
RAMBlock *rb = vmem->memdev->mr.ram_block; if (virtio_mem_is_busy()) { return -EBUSY; } if (!plug) { - ret = ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size); - if (ret) { - error_report("Unexpected error discarding RAM: %s", - strerror(-ret)); + if (ram_block_discard_range(rb, offset, size)) { return -EBUSY; } + virtio_mem_notify_unplug(vmem, offset, size); + } else if (virtio_mem_notify_plug(vmem, offset, size)) { + /* Could be a mapping attempt resulted in memory getting populated. */ + ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size); + return -EBUSY; } virtio_mem_set_bitmap(vmem, start_gpa, size, plug); return 0; @@ -318,17 +487,16 @@ static void virtio_mem_resize_usable_region(VirtIOMEM *vmem, static int virtio_mem_unplug_all(VirtIOMEM *vmem) { RAMBlock *rb = vmem->memdev->mr.ram_block; - int ret; if (virtio_mem_is_busy()) { return -EBUSY; } - ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); - if (ret) { - error_report("Unexpected error discarding RAM: %s", strerror(-ret)); + if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) { return -EBUSY; } + virtio_mem_notify_unplug_all(vmem); + bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size); if (vmem->size) { vmem->size = 0; @@ -551,7 +719,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) return; } - if (ram_block_discard_require(true)) { + if (ram_block_coordinated_discard_require(true)) { error_setg(errp, "Discarding RAM is disabled"); return; } @@ -559,7 +727,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); if (ret) { error_setg_errno(errp, -ret, "Unexpected error discarding RAM"); - ram_block_discard_require(false); + ram_block_coordinated_discard_require(false); return; } @@ -577,6 +745,13 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem)); 
qemu_register_reset(virtio_mem_system_reset, vmem); precopy_add_notifier(&vmem->precopy_notifier); + + /* + * Set ourselves as RamDiscardManager before the plug handler maps the + * memory region and exposes it via an address space. + */ + memory_region_set_ram_discard_manager(&vmem->memdev->mr, + RAM_DISCARD_MANAGER(vmem)); } static void virtio_mem_device_unrealize(DeviceState *dev) @@ -584,6 +759,11 @@ static void virtio_mem_device_unrealize(DeviceState *dev) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOMEM *vmem = VIRTIO_MEM(dev); + /* + * The unplug handler unmapped the memory region, it cannot be + * found via an address space anymore. Unset ourselves. + */ + memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL); precopy_remove_notifier(&vmem->precopy_notifier); qemu_unregister_reset(virtio_mem_system_reset, vmem); vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem)); @@ -591,43 +771,47 @@ static void virtio_mem_device_unrealize(DeviceState *dev) virtio_del_queue(vdev, 0); virtio_cleanup(vdev); g_free(vmem->bitmap); - ram_block_discard_require(false); + ram_block_coordinated_discard_require(false); } -static int virtio_mem_restore_unplugged(VirtIOMEM *vmem) +static int virtio_mem_discard_range_cb(const VirtIOMEM *vmem, void *arg, + uint64_t offset, uint64_t size) { RAMBlock *rb = vmem->memdev->mr.ram_block; - unsigned long first_zero_bit, last_zero_bit; - uint64_t offset, length; - int ret; - /* Find consecutive unplugged blocks and discard the consecutive range. */ - first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size); - while (first_zero_bit < vmem->bitmap_size) { - offset = first_zero_bit * vmem->block_size; - last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, - first_zero_bit + 1) - 1; - length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size; + return ram_block_discard_range(rb, offset, size) ? 
-EINVAL : 0; +} - ret = ram_block_discard_range(rb, offset, length); - if (ret) { - error_report("Unexpected error discarding RAM: %s", - strerror(-ret)); - return -EINVAL; - } - first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, - last_zero_bit + 2); - } - return 0; +static int virtio_mem_restore_unplugged(VirtIOMEM *vmem) +{ + /* Make sure all memory is really discarded after migration. */ + return virtio_mem_for_each_unplugged_range(vmem, NULL, + virtio_mem_discard_range_cb); } static int virtio_mem_post_load(void *opaque, int version_id) { + VirtIOMEM *vmem = VIRTIO_MEM(opaque); + RamDiscardListener *rdl; + int ret; + + /* + * We started out with all memory discarded and our memory region is mapped + * into an address space. Replay, now that we updated the bitmap. + */ + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_populate_cb); + if (ret) { + return ret; + } + } + if (migration_in_incoming_postcopy()) { return 0; } - return virtio_mem_restore_unplugged(VIRTIO_MEM(opaque)); + return virtio_mem_restore_unplugged(vmem); } typedef struct VirtIOMEMMigSanityChecks { @@ -702,6 +886,7 @@ static const VMStateDescription vmstate_virtio_mem_device = { .name = "virtio-mem-device", .minimum_version_id = 1, .version_id = 1, + .priority = MIG_PRI_VIRTIO_MEM, .post_load = virtio_mem_post_load, .fields = (VMStateField[]) { VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks, @@ -872,28 +1057,19 @@ static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name, vmem->block_size = value; } -static void virtio_mem_precopy_exclude_unplugged(VirtIOMEM *vmem) +static int virtio_mem_precopy_exclude_range_cb(const VirtIOMEM *vmem, void *arg, + uint64_t offset, uint64_t size) { void * const host = qemu_ram_get_host_addr(vmem->memdev->mr.ram_block); - unsigned long first_zero_bit, last_zero_bit; - uint64_t offset, length; - /* - * Find consecutive unplugged blocks 
and exclude them from migration. - * - * Note: Blocks cannot get (un)plugged during precopy, no locking needed. - */ - first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size); - while (first_zero_bit < vmem->bitmap_size) { - offset = first_zero_bit * vmem->block_size; - last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, - first_zero_bit + 1) - 1; - length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size; + qemu_guest_free_page_hint(host + offset, size); + return 0; +} - qemu_guest_free_page_hint(host + offset, length); - first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, - last_zero_bit + 2); - } +static void virtio_mem_precopy_exclude_unplugged(VirtIOMEM *vmem) +{ + virtio_mem_for_each_unplugged_range(vmem, NULL, + virtio_mem_precopy_exclude_range_cb); } static int virtio_mem_precopy_notify(NotifierWithReturn *n, void *data) @@ -918,6 +1094,7 @@ static void virtio_mem_instance_init(Object *obj) notifier_list_init(&vmem->size_change_notifiers); vmem->precopy_notifier.notify = virtio_mem_precopy_notify; + QLIST_INIT(&vmem->rdl_list); object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size, NULL, NULL, NULL); @@ -937,11 +1114,107 @@ static Property virtio_mem_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm, + const MemoryRegion *mr) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(rdm); + + g_assert(mr == &vmem->memdev->mr); + return vmem->block_size; +} + +static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm, + const MemoryRegionSection *s) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(rdm); + uint64_t start_gpa = vmem->addr + s->offset_within_region; + uint64_t end_gpa = start_gpa + int128_get64(s->size); + + g_assert(s->mr == &vmem->memdev->mr); + + start_gpa = QEMU_ALIGN_DOWN(start_gpa, vmem->block_size); + end_gpa = QEMU_ALIGN_UP(end_gpa, vmem->block_size); + + if (!virtio_mem_valid_range(vmem, start_gpa, 
end_gpa - start_gpa)) { + return false; + } + + return virtio_mem_test_bitmap(vmem, start_gpa, end_gpa - start_gpa, true); +} + +struct VirtIOMEMReplayData { + void *fn; + void *opaque; +}; + +static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg) +{ + struct VirtIOMEMReplayData *data = arg; + + return ((ReplayRamPopulate)data->fn)(s, data->opaque); +} + +static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm, + MemoryRegionSection *s, + ReplayRamPopulate replay_fn, + void *opaque) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(rdm); + struct VirtIOMEMReplayData data = { + .fn = replay_fn, + .opaque = opaque, + }; + + g_assert(s->mr == &vmem->memdev->mr); + return virtio_mem_for_each_plugged_section(vmem, s, &data, + virtio_mem_rdm_replay_populated_cb); +} + +static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *s) +{ + VirtIOMEM *vmem = VIRTIO_MEM(rdm); + int ret; + + g_assert(s->mr == &vmem->memdev->mr); + rdl->section = memory_region_section_new_copy(s); + + QLIST_INSERT_HEAD(&vmem->rdl_list, rdl, next); + ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_populate_cb); + if (ret) { + error_report("%s: Replaying plugged ranges failed: %s", __func__, + strerror(-ret)); + } +} + +static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl) +{ + VirtIOMEM *vmem = VIRTIO_MEM(rdm); + + g_assert(rdl->section->mr == &vmem->memdev->mr); + if (vmem->size) { + if (rdl->double_discard_supported) { + rdl->notify_discard(rdl, rdl->section); + } else { + virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_discard_cb); + } + } + + memory_region_section_free_copy(rdl->section); + rdl->section = NULL; + QLIST_REMOVE(rdl, next); +} + static void virtio_mem_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); VirtioDeviceClass *vdc = 
VIRTIO_DEVICE_CLASS(klass); VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass); + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass); device_class_set_props(dc, virtio_mem_properties); dc->vmsd = &vmstate_virtio_mem; @@ -957,6 +1230,12 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data) vmc->get_memory_region = virtio_mem_get_memory_region; vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier; vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier; + + rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity; + rdmc->is_populated = virtio_mem_rdm_is_populated; + rdmc->replay_populated = virtio_mem_rdm_replay_populated; + rdmc->register_listener = virtio_mem_rdm_register_listener; + rdmc->unregister_listener = virtio_mem_rdm_unregister_listener; } static const TypeInfo virtio_mem_info = { @@ -966,6 +1245,10 @@ static const TypeInfo virtio_mem_info = { .instance_init = virtio_mem_instance_init, .class_init = virtio_mem_class_init, .class_size = sizeof(VirtIOMEMClass), + .interfaces = (InterfaceInfo[]) { + { TYPE_RAM_DISCARD_MANAGER }, + { } + }, }; static void virtio_register_types(void) diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c index 5952471..1af48a1 100644 --- a/hw/virtio/virtio-mmio.c +++ b/hw/virtio/virtio-mmio.c @@ -29,6 +29,7 @@ #include "qemu/host-utils.h" #include "qemu/module.h" #include "sysemu/kvm.h" +#include "sysemu/replay.h" #include "hw/virtio/virtio-mmio.h" #include "qemu/error-report.h" #include "qemu/log.h" @@ -740,6 +741,11 @@ static void virtio_mmio_realizefn(DeviceState *d, Error **errp) proxy->flags &= ~VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD; } + /* fd-based ioevents can't be synchronized in record/replay */ + if (replay_mode != REPLAY_MODE_NONE) { + proxy->flags &= ~VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD; + } + if (proxy->legacy) { memory_region_init_io(&proxy->iomem, OBJECT(d), &virtio_legacy_mem_ops, proxy, diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c 
index b321604..433060a 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -37,6 +37,7 @@ #include "qemu/range.h" #include "hw/virtio/virtio-bus.h" #include "qapi/visitor.h" +#include "sysemu/replay.h" #define VIRTIO_PCI_REGION_SIZE(dev) VIRTIO_PCI_CONFIG_OFF(msix_present(dev)) @@ -423,6 +424,11 @@ static uint64_t virtio_pci_config_read(void *opaque, hwaddr addr, VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); uint32_t config = VIRTIO_PCI_CONFIG_SIZE(&proxy->pci_dev); uint64_t val = 0; + + if (vdev == NULL) { + return UINT64_MAX; + } + if (addr < config) { return virtio_ioport_read(proxy, addr); } @@ -454,6 +460,11 @@ static void virtio_pci_config_write(void *opaque, hwaddr addr, VirtIOPCIProxy *proxy = opaque; uint32_t config = VIRTIO_PCI_CONFIG_SIZE(&proxy->pci_dev); VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + if (vdev == NULL) { + return; + } + if (addr < config) { virtio_ioport_write(proxy, addr, val); return; @@ -1146,6 +1157,10 @@ static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, uint32_t val = 0; int i; + if (vdev == NULL) { + return UINT64_MAX; + } + switch (addr) { case VIRTIO_PCI_COMMON_DFSELECT: val = proxy->dfselect; @@ -1229,6 +1244,10 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, VirtIOPCIProxy *proxy = opaque; VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + if (vdev == NULL) { + return; + } + switch (addr) { case VIRTIO_PCI_COMMON_DFSELECT: proxy->dfselect = val; @@ -1330,6 +1349,11 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, static uint64_t virtio_pci_notify_read(void *opaque, hwaddr addr, unsigned size) { + VirtIOPCIProxy *proxy = opaque; + if (virtio_bus_get_device(&proxy->bus) == NULL) { + return UINT64_MAX; + } + return 0; } @@ -1367,7 +1391,7 @@ static uint64_t virtio_pci_isr_read(void *opaque, hwaddr addr, uint64_t val; if (vdev == NULL) { - return 0; + return UINT64_MAX; } val = qatomic_xchg(&vdev->isr, 0); @@ -1388,7 +1412,7 @@ 
static uint64_t virtio_pci_device_read(void *opaque, hwaddr addr, uint64_t val; if (vdev == NULL) { - return 0; + return UINT64_MAX; } switch (size) { @@ -1760,6 +1784,11 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD; } + /* fd-based ioevents can't be synchronized in record/replay */ + if (replay_mode != REPLAY_MODE_NONE) { + proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD; + } + /* * virtio pci bar layout used by default. * subclasses can re-arrange things if needed. diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index ab516ac..6dcf3ba 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -3728,6 +3728,10 @@ static int virtio_device_start_ioeventfd_impl(VirtIODevice *vdev) VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev))); int i, n, r, err; + /* + * Batch all the host notifiers in a single transaction to avoid + * quadratic time complexity in address_space_update_ioeventfds(). + */ memory_region_transaction_begin(); for (n = 0; n < VIRTIO_QUEUE_MAX; n++) { VirtQueue *vq = &vdev->vq[n]; @@ -3766,6 +3770,10 @@ assign_error: r = virtio_bus_set_host_notifier(qbus, n, false); assert(r >= 0); } + /* + * The transaction expects the ioeventfds to be open when it + * commits. Do it now, before the cleanup loop. + */ memory_region_transaction_commit(); while (--i >= 0) { @@ -3790,6 +3798,10 @@ static void virtio_device_stop_ioeventfd_impl(VirtIODevice *vdev) VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev))); int n, r; + /* + * Batch all the host notifiers in a single transaction to avoid + * quadratic time complexity in address_space_update_ioeventfds(). 
+ */ memory_region_transaction_begin(); for (n = 0; n < VIRTIO_QUEUE_MAX; n++) { VirtQueue *vq = &vdev->vq[n]; @@ -3801,6 +3813,10 @@ static void virtio_device_stop_ioeventfd_impl(VirtIODevice *vdev) r = virtio_bus_set_host_notifier(qbus, n, false); assert(r >= 0); } + /* + * The transaction expects the ioeventfds to be open when it + * commits. Do it now, before the cleanup loop. + */ memory_region_transaction_commit(); for (n = 0; n < VIRTIO_QUEUE_MAX; n++) { diff --git a/include/block/aio.h b/include/block/aio.h index 10fcae1..807edce 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -292,19 +292,44 @@ void aio_context_acquire(AioContext *ctx); void aio_context_release(AioContext *ctx); /** + * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will + * run only once and as soon as possible. + * + * @name: A human-readable identifier for debugging purposes. + */ +void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, + const char *name); + +/** * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run * only once and as soon as possible. + * + * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the + * name string. */ -void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque); +#define aio_bh_schedule_oneshot(ctx, cb, opaque) \ + aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb))) /** - * aio_bh_new: Allocate a new bottom half structure. + * aio_bh_new_full: Allocate a new bottom half structure. * * Bottom halves are lightweight callbacks whose invocation is guaranteed * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure * is opaque and must be allocated prior to its use. + * + * @name: A human-readable identifier for debugging purposes. 
+ */ +QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, + const char *name); + +/** + * aio_bh_new: Allocate a new bottom half structure + * + * A convenience wrapper for aio_bh_new_full() that uses the cb as the name + * string. */ -QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque); +#define aio_bh_new(ctx, cb, opaque) \ + aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb))) /** * aio_notify: Force processing of pending events. diff --git a/include/exec/memory.h b/include/exec/memory.h index b116f7c..c3d417d 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -42,6 +42,12 @@ typedef struct IOMMUMemoryRegionClass IOMMUMemoryRegionClass; DECLARE_OBJ_CHECKERS(IOMMUMemoryRegion, IOMMUMemoryRegionClass, IOMMU_MEMORY_REGION, TYPE_IOMMU_MEMORY_REGION) +#define TYPE_RAM_DISCARD_MANAGER "qemu:ram-discard-manager" +typedef struct RamDiscardManagerClass RamDiscardManagerClass; +typedef struct RamDiscardManager RamDiscardManager; +DECLARE_OBJ_CHECKERS(RamDiscardManager, RamDiscardManagerClass, + RAM_DISCARD_MANAGER, TYPE_RAM_DISCARD_MANAGER); + #ifdef CONFIG_FUZZ void fuzz_dma_read_cb(size_t addr, size_t len, @@ -65,6 +71,28 @@ struct ReservedRegion { unsigned type; }; +/** + * struct MemoryRegionSection: describes a fragment of a #MemoryRegion + * + * @mr: the region, or %NULL if empty + * @fv: the flat view of the address space the region is mapped in + * @offset_within_region: the beginning of the section, relative to @mr's start + * @size: the size of the section; will not exceed @mr's boundaries + * @offset_within_address_space: the address of the first byte of the section + * relative to the region's address space + * @readonly: writes to this section are ignored + * @nonvolatile: this section is non-volatile + */ +struct MemoryRegionSection { + Int128 size; + MemoryRegion *mr; + FlatView *fv; + hwaddr offset_within_region; + hwaddr offset_within_address_space; + bool readonly; + bool nonvolatile; +}; + typedef 
struct IOMMUTLBEntry IOMMUTLBEntry; /* See address_space_translate: bit 0 is read, bit 1 is write. */ @@ -448,6 +476,206 @@ struct IOMMUMemoryRegionClass { Error **errp); }; +typedef struct RamDiscardListener RamDiscardListener; +typedef int (*NotifyRamPopulate)(RamDiscardListener *rdl, + MemoryRegionSection *section); +typedef void (*NotifyRamDiscard)(RamDiscardListener *rdl, + MemoryRegionSection *section); + +struct RamDiscardListener { + /* + * @notify_populate: + * + * Notification that previously discarded memory is about to get populated. + * Listeners are able to object. If any listener objects, already + * successfully notified listeners are notified about a discard again. + * + * @rdl: the #RamDiscardListener getting notified + * @section: the #MemoryRegionSection to get populated. The section + * is aligned within the memory region to the minimum granularity + * unless it would exceed the registered section. + * + * Returns 0 on success. If the notification is rejected by the listener, + * an error is returned. + */ + NotifyRamPopulate notify_populate; + + /* + * @notify_discard: + * + * Notification that previously populated memory was discarded successfully + * and listeners should drop all references to such memory and prevent + * new population (e.g., unmap). + * + * @rdl: the #RamDiscardListener getting notified + * @section: the #MemoryRegionSection that was discarded. The section + * is aligned within the memory region to the minimum granularity + * unless it would exceed the registered section. + */ + NotifyRamDiscard notify_discard; + + /* + * @double_discard_supported: + * + * The listener supports getting @notify_discard notifications that span + * already discarded parts.
+ */ + bool double_discard_supported; + + MemoryRegionSection *section; + QLIST_ENTRY(RamDiscardListener) next; +}; + +static inline void ram_discard_listener_init(RamDiscardListener *rdl, + NotifyRamPopulate populate_fn, + NotifyRamDiscard discard_fn, + bool double_discard_supported) +{ + rdl->notify_populate = populate_fn; + rdl->notify_discard = discard_fn; + rdl->double_discard_supported = double_discard_supported; +} + +typedef int (*ReplayRamPopulate)(MemoryRegionSection *section, void *opaque); + +/* + * RamDiscardManagerClass: + * + * A #RamDiscardManager coordinates which parts of specific RAM #MemoryRegion + * regions are currently populated to be used/accessed by the VM, notifying + * after parts were discarded (freeing up memory) and before parts will be + * populated (consuming memory), to be used/accessed by the VM. + * + * A #RamDiscardManager can only be set for a RAM #MemoryRegion while the + * #MemoryRegion isn't mapped yet; it cannot change while the #MemoryRegion is + * mapped. + * + * The #RamDiscardManager is intended to be used by technologies that are + * incompatible with discarding of RAM (e.g., VFIO, which may pin all + * memory inside a #MemoryRegion), and require proper coordination to only + * map the currently populated parts, to hinder parts that are expected to + * remain discarded from silently getting populated and consuming memory. + * Technologies that support discarding of RAM don't have to bother and can + * simply map the whole #MemoryRegion. + * + * An example #RamDiscardManager is virtio-mem, which logically (un)plugs + * memory within an assigned RAM #MemoryRegion, coordinated with the VM. + * Logically unplugging memory consists of discarding RAM. The VM agreed to not + * access unplugged (discarded) memory - especially via DMA. virtio-mem will + * properly coordinate with listeners before memory is plugged (populated), + * and after memory is unplugged (discarded).
+ * + * Listeners are called in multiples of the minimum granularity (unless it + * would exceed the registered range) and changes are aligned to the minimum + * granularity within the #MemoryRegion. Listeners have to prepare for memory + * becoming discarded in a different granularity than it was populated and the + * other way around. + */ +struct RamDiscardManagerClass { + /* private */ + InterfaceClass parent_class; + + /* public */ + + /** + * @get_min_granularity: + * + * Get the minimum granularity in which listeners will get notified + * about changes within the #MemoryRegion via the #RamDiscardManager. + * + * @rdm: the #RamDiscardManager + * @mr: the #MemoryRegion + * + * Returns the minimum granularity. + */ + uint64_t (*get_min_granularity)(const RamDiscardManager *rdm, + const MemoryRegion *mr); + + /** + * @is_populated: + * + * Check whether the given #MemoryRegionSection is completely populated + * (i.e., no parts are currently discarded) via the #RamDiscardManager. + * There are no alignment requirements. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * + * Returns whether the given range is completely populated. + */ + bool (*is_populated)(const RamDiscardManager *rdm, + const MemoryRegionSection *section); + + /** + * @replay_populated: + * + * Call the #ReplayRamPopulate callback for all populated parts within the + * #MemoryRegionSection via the #RamDiscardManager. + * + * In case any call fails, no further calls are made. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * @replay_fn: the #ReplayRamPopulate callback + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed.
+ */ + int (*replay_populated)(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamPopulate replay_fn, void *opaque); + + /** + * @register_listener: + * + * Register a #RamDiscardListener for the given #MemoryRegionSection and + * immediately notify the #RamDiscardListener about all populated parts + * within the #MemoryRegionSection via the #RamDiscardManager. + * + * In case any notification fails, no further notifications are triggered + * and an error is logged. + * + * @rdm: the #RamDiscardManager + * @rdl: the #RamDiscardListener + * @section: the #MemoryRegionSection + */ + void (*register_listener)(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *section); + + /** + * @unregister_listener: + * + * Unregister a previously registered #RamDiscardListener via the + * #RamDiscardManager after notifying the #RamDiscardListener about all + * populated parts becoming unpopulated within the registered + * #MemoryRegionSection. + * + * @rdm: the #RamDiscardManager + * @rdl: the #RamDiscardListener + */ + void (*unregister_listener)(RamDiscardManager *rdm, + RamDiscardListener *rdl); +}; + +uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm, + const MemoryRegion *mr); + +bool ram_discard_manager_is_populated(const RamDiscardManager *rdm, + const MemoryRegionSection *section); + +int ram_discard_manager_replay_populated(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamPopulate replay_fn, + void *opaque); + +void ram_discard_manager_register_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *section); + +void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl); + typedef struct CoalescedMemoryRange CoalescedMemoryRange; typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd; @@ -494,6 +722,7 @@ struct MemoryRegion { const char *name; unsigned ioeventfd_nb; MemoryRegionIoeventfd *ioeventfds; + 
RamDiscardManager *rdm; /* Only for RAM */ }; struct IOMMUMemoryRegion { @@ -825,28 +1054,6 @@ typedef bool (*flatview_cb)(Int128 start, */ void flatview_for_each_range(FlatView *fv, flatview_cb cb, void *opaque); -/** - * struct MemoryRegionSection: describes a fragment of a #MemoryRegion - * - * @mr: the region, or %NULL if empty - * @fv: the flat view of the address space the region is mapped in - * @offset_within_region: the beginning of the section, relative to @mr's start - * @size: the size of the section; will not exceed @mr's boundaries - * @offset_within_address_space: the address of the first byte of the section - * relative to the region's address space - * @readonly: writes to this section are ignored - * @nonvolatile: this section is non-volatile - */ -struct MemoryRegionSection { - Int128 size; - MemoryRegion *mr; - FlatView *fv; - hwaddr offset_within_region; - hwaddr offset_within_address_space; - bool readonly; - bool nonvolatile; -}; - static inline bool MemoryRegionSection_eq(MemoryRegionSection *a, MemoryRegionSection *b) { @@ -860,6 +1067,26 @@ static inline bool MemoryRegionSection_eq(MemoryRegionSection *a, } /** + * memory_region_section_new_copy: Copy a memory region section + * + * Allocate memory for a new copy, copy the memory region section, and + * properly take a reference on all relevant members. + * + * @s: the #MemoryRegionSection to copy + */ +MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s); + +/** + * memory_region_section_free_copy: Free a copied memory region section + * + * Free a copy of a memory section created via memory_region_section_new_copy(), + * properly dropping references on all relevant members. + * + * @s: the #MemoryRegionSection to free + */ +void memory_region_section_free_copy(MemoryRegionSection *s); + +/** * memory_region_init: Initialize a memory region * * The region typically acts as a container for other memory regions.
Use @@ -2024,6 +2251,41 @@ bool memory_region_present(MemoryRegion *container, hwaddr addr); bool memory_region_is_mapped(MemoryRegion *mr); /** + * memory_region_get_ram_discard_manager: get the #RamDiscardManager for a + * #MemoryRegion + * + * The #RamDiscardManager cannot change while a memory region is mapped. + * + * @mr: the #MemoryRegion + */ +RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr); + +/** + * memory_region_has_ram_discard_manager: check whether a #MemoryRegion has a + * #RamDiscardManager assigned + * + * @mr: the #MemoryRegion + */ +static inline bool memory_region_has_ram_discard_manager(MemoryRegion *mr) +{ + return !!memory_region_get_ram_discard_manager(mr); +} + +/** + * memory_region_set_ram_discard_manager: set the #RamDiscardManager for a + * #MemoryRegion + * + * This function must not be called for a mapped #MemoryRegion, a #MemoryRegion + * that does not cover RAM, or a #MemoryRegion that already has a + * #RamDiscardManager assigned. + * + * @mr: the #MemoryRegion + * @rdm: #RamDiscardManager to set + */ +void memory_region_set_ram_discard_manager(MemoryRegion *mr, + RamDiscardManager *rdm); + +/** * memory_region_find: translate an address/size relative to a * MemoryRegion into a #MemoryRegionSection. * @@ -2632,6 +2894,12 @@ static inline MemOp devend_memop(enum device_endian end) int ram_block_discard_disable(bool state); /* + * See ram_block_discard_disable(): only disable uncoordinated discards, + * keeping coordinated discards (via the RamDiscardManager) enabled. + */ +int ram_block_uncoordinated_discard_disable(bool state); + +/* * Inhibit technologies that disable discarding of pages in RAM blocks. * * Returns 0 if successful. Returns -EBUSY if discards are already set to @@ -2640,12 +2908,20 @@ int ram_block_discard_disable(bool state); int ram_block_discard_require(bool state); /* - * Test if discarding of memory in ram blocks is disabled. 
+ * See ram_block_discard_require(): only inhibit technologies that disable + * uncoordinated discarding of pages in RAM blocks, allowing co-existence with + * technologies that only inhibit uncoordinated discards (via the + * RamDiscardManager). + */ +int ram_block_coordinated_discard_require(bool state); + +/* + * Test if any discarding of memory in ram blocks is disabled. */ bool ram_block_discard_is_disabled(void); /* - * Test if discarding of memory in ram blocks is required to work reliably. + * Test if any discarding of memory in ram blocks is required to work reliably. */ bool ram_block_discard_is_required(void); diff --git a/include/hw/block/block.h b/include/hw/block/block.h index c172cbe..5902c04 100644 --- a/include/hw/block/block.h +++ b/include/hw/block/block.h @@ -19,6 +19,7 @@ typedef struct BlockConf { BlockBackend *blk; + OnOffAuto backend_defaults; uint32_t physical_block_size; uint32_t logical_block_size; uint32_t min_io_size; @@ -48,6 +49,8 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf) } #define DEFINE_BLOCK_PROPERTIES_BASE(_state, _conf) \ + DEFINE_PROP_ON_OFF_AUTO("backend_defaults", _state, \ + _conf.backend_defaults, ON_OFF_AUTO_AUTO), \ DEFINE_PROP_BLOCKSIZE("logical_block_size", _state, \ _conf.logical_block_size), \ DEFINE_PROP_BLOCKSIZE("physical_block_size", _state, \ diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index f05219f..637652a 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -12,6 +12,7 @@ #include "hw/ppc/spapr_xive.h" /* For SpaprXive */ #include "hw/ppc/xics.h" /* For ICSState */ #include "hw/ppc/spapr_tpm_proxy.h" +#include "hw/ppc/vof.h" struct SpaprVioBus; struct SpaprPhbState; @@ -74,8 +75,10 @@ typedef enum { #define SPAPR_CAP_CCF_ASSIST 0x09 /* Implements PAPR FWNMI option */ #define SPAPR_CAP_FWNMI 0x0A +/* Support H_RPT_INVALIDATE */ +#define SPAPR_CAP_RPT_INVALIDATE 0x0B /* Num Caps */ -#define SPAPR_CAP_NUM (SPAPR_CAP_FWNMI + 1) +#define SPAPR_CAP_NUM
(SPAPR_CAP_RPT_INVALIDATE + 1) /* * Capability Values @@ -180,6 +183,7 @@ struct SpaprMachineState { uint64_t kernel_addr; uint32_t initrd_base; long initrd_size; + Vof *vof; uint64_t rtc_offset; /* Now used only during incoming migration */ struct PPCTimebase tb; bool has_graphics; @@ -398,10 +402,13 @@ struct SpaprMachineState { #define H_CPU_CHAR_THR_RECONF_TRIG PPC_BIT(6) #define H_CPU_CHAR_CACHE_COUNT_DIS PPC_BIT(7) #define H_CPU_CHAR_BCCTR_FLUSH_ASSIST PPC_BIT(9) + #define H_CPU_BEHAV_FAVOUR_SECURITY PPC_BIT(0) #define H_CPU_BEHAV_L1D_FLUSH_PR PPC_BIT(1) #define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR PPC_BIT(2) #define H_CPU_BEHAV_FLUSH_COUNT_CACHE PPC_BIT(5) +#define H_CPU_BEHAV_NO_L1D_FLUSH_ENTRY PPC_BIT(7) +#define H_CPU_BEHAV_NO_L1D_FLUSH_UACCESS PPC_BIT(8) /* Each control block has to be on a 4K boundary */ #define H_CB_ALIGNMENT 4096 @@ -542,8 +549,9 @@ struct SpaprMachineState { #define H_SCM_UNBIND_MEM 0x3F0 #define H_SCM_UNBIND_ALL 0x3FC #define H_SCM_HEALTH 0x400 +#define H_RPT_INVALIDATE 0x448 -#define MAX_HCALL_OPCODE H_SCM_HEALTH +#define MAX_HCALL_OPCODE H_RPT_INVALIDATE /* The hcalls above are standardized in PAPR and implemented by pHyp * as well. 
@@ -558,7 +566,9 @@ struct SpaprMachineState { /* Client Architecture support */ #define KVMPPC_H_CAS (KVMPPC_HCALL_BASE + 0x2) #define KVMPPC_H_UPDATE_DT (KVMPPC_HCALL_BASE + 0x3) -#define KVMPPC_HCALL_MAX KVMPPC_H_UPDATE_DT +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */ +#define KVMPPC_H_VOF_CLIENT (KVMPPC_HCALL_BASE + 0x5) +#define KVMPPC_HCALL_MAX KVMPPC_H_VOF_CLIENT /* * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating @@ -770,7 +780,7 @@ void spapr_load_rtas(SpaprMachineState *spapr, void *fdt, hwaddr addr); #define SPAPR_IS_PCI_LIOBN(liobn) (!!((liobn) & 0x80000000)) #define SPAPR_PCI_DMA_WINDOW_NUM(liobn) ((liobn) & 0xff) -#define RTAS_SIZE 2048 +#define RTAS_MIN_SIZE 20 /* hv_rtas_size in SLOF */ #define RTAS_ERROR_LOG_MAX 2048 /* Offset from rtas-base where error log is placed */ @@ -932,6 +942,7 @@ extern const VMStateDescription vmstate_spapr_cap_nested_kvm_hv; extern const VMStateDescription vmstate_spapr_cap_large_decr; extern const VMStateDescription vmstate_spapr_cap_ccf_assist; extern const VMStateDescription vmstate_spapr_cap_fwnmi; +extern const VMStateDescription vmstate_spapr_cap_rpt_invalidate; static inline uint8_t spapr_get_cap(SpaprMachineState *spapr, int cap) { @@ -956,4 +967,16 @@ bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, void spapr_set_all_lpcrs(target_ulong value, target_ulong mask); hwaddr spapr_get_rtas_addr(void); bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr); + +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt, Error **errp); +void spapr_vof_quiesce(MachineState *ms); +bool spapr_vof_setprop(MachineState *ms, const char *path, const char *propname, + void *val, int vallen); +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr, + target_ulong opcode, target_ulong *args); +target_ulong spapr_vof_client_architecture_support(MachineState *ms, + CPUState *cs, + target_ulong ovec_addr); +void 
spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt); + #endif /* HW_SPAPR_H */ diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h new file mode 100644 index 0000000..640be46 --- /dev/null +++ b/include/hw/ppc/vof.h @@ -0,0 +1,58 @@ +/* + * Virtual Open Firmware + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#ifndef HW_VOF_H +#define HW_VOF_H + +typedef struct Vof { + uint64_t top_addr; /* copied from rma_size */ + GArray *claimed; /* array of SpaprOfClaimed */ + uint64_t claimed_base; + GHashTable *of_instances; /* ihandle -> SpaprOfInstance */ + uint32_t of_instance_last; + char *bootargs; + long fw_size; +} Vof; + +int vof_client_call(MachineState *ms, Vof *vof, void *fdt, + target_ulong args_real); +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t align); +void vof_init(Vof *vof, uint64_t top_addr, Error **errp); +void vof_cleanup(Vof *vof); +void vof_build_dt(void *fdt, Vof *vof); +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename, + const char *prop, const char *path); + +#define TYPE_VOF_MACHINE_IF "vof-machine-if" + +typedef struct VofMachineIfClass VofMachineIfClass; +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, TYPE_VOF_MACHINE_IF) + +struct VofMachineIfClass { + InterfaceClass parent; + target_ulong (*client_architecture_support)(MachineState *ms, CPUState *cs, + target_ulong vec); + void (*quiesce)(MachineState *ms); + bool (*setprop)(MachineState *ms, const char *path, const char *propname, + void *val, int vallen); +}; + +/* + * Initial stack size is from + * https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html#REF27292 + * + * "Client programs shall be invoked with a valid stack pointer (r1) with + * at least 32K bytes of memory available for stack growth". 
+ */ +#define VOF_STACK_SIZE 0x8000 + +#define VOF_MEM_READ(pa, buf, size) \ + address_space_read(&address_space_memory, \ + (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size)) +#define VOF_MEM_WRITE(pa, buf, size) \ + address_space_write(&address_space_memory, \ + (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size)) + +#endif /* HW_VOF_H */ diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 6141162..8af11b0 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -88,9 +88,11 @@ typedef struct VFIOContainer { uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; unsigned long pgsizes; + unsigned int dma_max_mappings; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_ENTRY(VFIOContainer) next; } VFIOContainer; @@ -102,6 +104,16 @@ typedef struct VFIOGuestIOMMU { QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; +typedef struct VFIORamDiscardListener { + VFIOContainer *container; + MemoryRegion *mr; + hwaddr offset_within_address_space; + hwaddr size; + uint64_t granularity; + RamDiscardListener listener; + QLIST_ENTRY(VFIORamDiscardListener) next; +} VFIORamDiscardListener; + typedef struct VFIOHostDMAWindow { hwaddr min_iova; hwaddr max_iova; diff --git a/include/hw/virtio/virtio-mem.h b/include/hw/virtio/virtio-mem.h index 4eeb82d..9a6e348 100644 --- a/include/hw/virtio/virtio-mem.h +++ b/include/hw/virtio/virtio-mem.h @@ -67,6 +67,9 @@ struct VirtIOMEM { /* don't migrate unplugged memory */ NotifierWithReturn precopy_notifier; + + /* listeners to notify on plug/unplug activity. 
*/ + QLIST_HEAD(, RamDiscardListener) rdl_list; }; struct VirtIOMEMClass { diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 8df7b69..017c036 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -153,6 +153,7 @@ typedef enum { MIG_PRI_DEFAULT = 0, MIG_PRI_IOMMU, /* Must happen before PCI devices */ MIG_PRI_PCI_BUS, /* Must happen before IOMMU */ + MIG_PRI_VIRTIO_MEM, /* Must happen before IOMMU */ MIG_PRI_GICV3_ITS, /* Must happen before PCI devices */ MIG_PRI_GICV3, /* Must happen before the ITS */ MIG_PRI_MAX, diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h index 98aef56..8dbc6fc 100644 --- a/include/qemu/main-loop.h +++ b/include/qemu/main-loop.h @@ -294,7 +294,9 @@ void qemu_cond_timedwait_iothread(QemuCond *cond, int ms); void qemu_fd_register(int fd); -QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque); +#define qemu_bh_new(cb, opaque) \ + qemu_bh_new_full((cb), (opaque), (stringify(cb))) +QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name); void qemu_bh_schedule_idle(QEMUBH *bh); enum { diff --git a/include/standard-headers/asm-x86/kvm_para.h b/include/standard-headers/asm-x86/kvm_para.h index 215d01b..204cfb8 100644 --- a/include/standard-headers/asm-x86/kvm_para.h +++ b/include/standard-headers/asm-x86/kvm_para.h @@ -33,6 +33,8 @@ #define KVM_FEATURE_PV_SCHED_YIELD 13 #define KVM_FEATURE_ASYNC_PF_INT 14 #define KVM_FEATURE_MSI_EXT_DEST_ID 15 +#define KVM_FEATURE_HC_MAP_GPA_RANGE 16 +#define KVM_FEATURE_MIGRATION_CONTROL 17 #define KVM_HINTS_REALTIME 0 @@ -54,6 +56,7 @@ #define MSR_KVM_POLL_CONTROL 0x4b564d05 #define MSR_KVM_ASYNC_PF_INT 0x4b564d06 #define MSR_KVM_ASYNC_PF_ACK 0x4b564d07 +#define MSR_KVM_MIGRATION_CONTROL 0x4b564d08 struct kvm_steal_time { uint64_t steal; @@ -90,6 +93,16 @@ struct kvm_clock_pairing { /* MSR_KVM_ASYNC_PF_INT */ #define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0) +/* MSR_KVM_MIGRATION_CONTROL */ +#define KVM_MIGRATION_READY (1 << 0) + +/* 
KVM_HC_MAP_GPA_RANGE */ +#define KVM_MAP_GPA_RANGE_PAGE_SZ_4K 0 +#define KVM_MAP_GPA_RANGE_PAGE_SZ_2M (1 << 0) +#define KVM_MAP_GPA_RANGE_PAGE_SZ_1G (1 << 1) +#define KVM_MAP_GPA_RANGE_ENC_STAT(n) (n << 4) +#define KVM_MAP_GPA_RANGE_ENCRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(1) +#define KVM_MAP_GPA_RANGE_DECRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(0) /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h index a61ae52..352b51f 100644 --- a/include/standard-headers/drm/drm_fourcc.h +++ b/include/standard-headers/drm/drm_fourcc.h @@ -167,6 +167,13 @@ extern "C" { #define DRM_FORMAT_RGBA1010102 fourcc_code('R', 'A', '3', '0') /* [31:0] R:G:B:A 10:10:10:2 little endian */ #define DRM_FORMAT_BGRA1010102 fourcc_code('B', 'A', '3', '0') /* [31:0] B:G:R:A 10:10:10:2 little endian */ +/* 64 bpp RGB */ +#define DRM_FORMAT_XRGB16161616 fourcc_code('X', 'R', '4', '8') /* [63:0] x:R:G:B 16:16:16:16 little endian */ +#define DRM_FORMAT_XBGR16161616 fourcc_code('X', 'B', '4', '8') /* [63:0] x:B:G:R 16:16:16:16 little endian */ + +#define DRM_FORMAT_ARGB16161616 fourcc_code('A', 'R', '4', '8') /* [63:0] A:R:G:B 16:16:16:16 little endian */ +#define DRM_FORMAT_ABGR16161616 fourcc_code('A', 'B', '4', '8') /* [63:0] A:B:G:R 16:16:16:16 little endian */ + /* * Floating point 64bpp RGB * IEEE 754-2008 binary16 half-precision float diff --git a/include/standard-headers/linux/ethtool.h b/include/standard-headers/linux/ethtool.h index 218d944..053d3fa 100644 --- a/include/standard-headers/linux/ethtool.h +++ b/include/standard-headers/linux/ethtool.h @@ -233,7 +233,7 @@ enum tunable_id { ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ /* * Add your fresh new tunable attribute above and remember to update - * tunable_strings[] in net/core/ethtool.c + * tunable_strings[] in net/ethtool/common.c */ __ETHTOOL_TUNABLE_COUNT, }; @@ -297,7 +297,7 @@ enum phy_tunable_id { ETHTOOL_PHY_EDPD, /* * 
Add your fresh new phy tunable attribute above and remember to update - * phy_tunable_strings[] in net/core/ethtool.c + * phy_tunable_strings[] in net/ethtool/common.c */ __ETHTOOL_PHY_TUNABLE_COUNT, }; diff --git a/include/standard-headers/linux/input-event-codes.h b/include/standard-headers/linux/input-event-codes.h index c403b9c..b5e86b4 100644 --- a/include/standard-headers/linux/input-event-codes.h +++ b/include/standard-headers/linux/input-event-codes.h @@ -611,6 +611,7 @@ #define KEY_VOICECOMMAND 0x246 /* Listening Voice Command */ #define KEY_ASSISTANT 0x247 /* AL Context-aware desktop assistant */ #define KEY_KBD_LAYOUT_NEXT 0x248 /* AC Next Keyboard Layout Select */ +#define KEY_EMOJI_PICKER 0x249 /* Show/hide emoji picker (HUTRR101) */ #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ diff --git a/include/standard-headers/linux/virtio_ids.h b/include/standard-headers/linux/virtio_ids.h index f0c35ce..4fe842c 100644 --- a/include/standard-headers/linux/virtio_ids.h +++ b/include/standard-headers/linux/virtio_ids.h @@ -54,7 +54,7 @@ #define VIRTIO_ID_SOUND 25 /* virtio sound */ #define VIRTIO_ID_FS 26 /* virtio filesystem */ #define VIRTIO_ID_PMEM 27 /* virtio pmem */ -#define VIRTIO_ID_BT 28 /* virtio bluetooth */ #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ +#define VIRTIO_ID_BT 40 /* virtio bluetooth */ #endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/include/standard-headers/linux/virtio_vsock.h b/include/standard-headers/linux/virtio_vsock.h index be44321..3a23488 100644 --- a/include/standard-headers/linux/virtio_vsock.h +++ b/include/standard-headers/linux/virtio_vsock.h @@ -38,6 +38,9 @@ #include "standard-headers/linux/virtio_ids.h" #include "standard-headers/linux/virtio_config.h" +/* The feature bitmap for virtio vsock */ +#define VIRTIO_VSOCK_F_SEQPACKET 1 /* SOCK_SEQPACKET supported */ + struct virtio_vsock_config { uint64_t guest_cid; } 
QEMU_PACKED; @@ -65,6 +68,7 @@ struct virtio_vsock_hdr { enum virtio_vsock_type { VIRTIO_VSOCK_TYPE_STREAM = 1, + VIRTIO_VSOCK_TYPE_SEQPACKET = 2, }; enum virtio_vsock_op { @@ -91,4 +95,9 @@ enum virtio_vsock_shutdown { VIRTIO_VSOCK_SHUTDOWN_SEND = 2, }; +/* VIRTIO_VSOCK_OP_RW flags values */ +enum virtio_vsock_rw { + VIRTIO_VSOCK_SEQ_EOR = 1, +}; + #endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index b6a0eaa..3d2ce99 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -184,6 +184,17 @@ struct kvm_vcpu_events { __u32 reserved[12]; }; +struct kvm_arm_copy_mte_tags { + __u64 guest_ipa; + __u64 length; + void *addr; + __u64 flags; + __u64 reserved[2]; +}; + +#define KVM_ARM_TAGS_TO_GUEST 0 +#define KVM_ARM_TAGS_FROM_GUEST 1 + /* If you need to interpret the index values, here is the key: */ #define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 #define KVM_REG_ARM_COPROC_SHIFT 16 diff --git a/linux-headers/asm-generic/mman-common.h b/linux-headers/asm-generic/mman-common.h index f94f65d..1567a32 100644 --- a/linux-headers/asm-generic/mman-common.h +++ b/linux-headers/asm-generic/mman-common.h @@ -72,6 +72,9 @@ #define MADV_COLD 20 /* deactivate these pages */ #define MADV_PAGEOUT 21 /* reclaim these pages */ +#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ +#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h index 6de5a7f..f211961 100644 --- a/linux-headers/asm-generic/unistd.h +++ b/linux-headers/asm-generic/unistd.h @@ -863,8 +863,8 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise) __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #define __NR_mount_setattr 442 __SYSCALL(__NR_mount_setattr, sys_mount_setattr) -#define __NR_quotactl_path 443 
-__SYSCALL(__NR_quotactl_path, sys_quotactl_path) +#define __NR_quotactl_fd 443 +__SYSCALL(__NR_quotactl_fd, sys_quotactl_fd) #define __NR_landlock_create_ruleset 444 __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) diff --git a/linux-headers/asm-mips/mman.h b/linux-headers/asm-mips/mman.h index 57dc2ac..40b210c 100644 --- a/linux-headers/asm-mips/mman.h +++ b/linux-headers/asm-mips/mman.h @@ -98,6 +98,9 @@ #define MADV_COLD 20 /* deactivate these pages */ #define MADV_PAGEOUT 21 /* reclaim these pages */ +#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ +#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h index fce51fe..09cd297 100644 --- a/linux-headers/asm-mips/unistd_n32.h +++ b/linux-headers/asm-mips/unistd_n32.h @@ -372,6 +372,7 @@ #define __NR_process_madvise (__NR_Linux + 440) #define __NR_epoll_pwait2 (__NR_Linux + 441) #define __NR_mount_setattr (__NR_Linux + 442) +#define __NR_quotactl_fd (__NR_Linux + 443) #define __NR_landlock_create_ruleset (__NR_Linux + 444) #define __NR_landlock_add_rule (__NR_Linux + 445) #define __NR_landlock_restrict_self (__NR_Linux + 446) diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h index 0996001..780e0ce 100644 --- a/linux-headers/asm-mips/unistd_n64.h +++ b/linux-headers/asm-mips/unistd_n64.h @@ -348,6 +348,7 @@ #define __NR_process_madvise (__NR_Linux + 440) #define __NR_epoll_pwait2 (__NR_Linux + 441) #define __NR_mount_setattr (__NR_Linux + 442) +#define __NR_quotactl_fd (__NR_Linux + 443) #define __NR_landlock_create_ruleset (__NR_Linux + 444) #define __NR_landlock_add_rule (__NR_Linux + 445) #define __NR_landlock_restrict_self (__NR_Linux + 446) diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h index 954303a..06a2b3b 100644 --- 
a/linux-headers/asm-mips/unistd_o32.h +++ b/linux-headers/asm-mips/unistd_o32.h @@ -418,6 +418,7 @@ #define __NR_process_madvise (__NR_Linux + 440) #define __NR_epoll_pwait2 (__NR_Linux + 441) #define __NR_mount_setattr (__NR_Linux + 442) +#define __NR_quotactl_fd (__NR_Linux + 443) #define __NR_landlock_create_ruleset (__NR_Linux + 444) #define __NR_landlock_add_rule (__NR_Linux + 445) #define __NR_landlock_restrict_self (__NR_Linux + 446) diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h index 9155778..cd5a8a4 100644 --- a/linux-headers/asm-powerpc/unistd_32.h +++ b/linux-headers/asm-powerpc/unistd_32.h @@ -425,6 +425,7 @@ #define __NR_process_madvise 440 #define __NR_epoll_pwait2 441 #define __NR_mount_setattr 442 +#define __NR_quotactl_fd 443 #define __NR_landlock_create_ruleset 444 #define __NR_landlock_add_rule 445 #define __NR_landlock_restrict_self 446 diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h index 3cefa88..8458eff 100644 --- a/linux-headers/asm-powerpc/unistd_64.h +++ b/linux-headers/asm-powerpc/unistd_64.h @@ -397,6 +397,7 @@ #define __NR_process_madvise 440 #define __NR_epoll_pwait2 441 #define __NR_mount_setattr 442 +#define __NR_quotactl_fd 443 #define __NR_landlock_create_ruleset 444 #define __NR_landlock_add_rule 445 #define __NR_landlock_restrict_self 446 diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h index e8cd343..0c3cd29 100644 --- a/linux-headers/asm-s390/unistd_32.h +++ b/linux-headers/asm-s390/unistd_32.h @@ -415,6 +415,7 @@ #define __NR_process_madvise 440 #define __NR_epoll_pwait2 441 #define __NR_mount_setattr 442 +#define __NR_quotactl_fd 443 #define __NR_landlock_create_ruleset 444 #define __NR_landlock_add_rule 445 #define __NR_landlock_restrict_self 446 diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h index 86830e1..8dfc08b 100644 --- a/linux-headers/asm-s390/unistd_64.h +++ 
b/linux-headers/asm-s390/unistd_64.h @@ -363,6 +363,7 @@ #define __NR_process_madvise 440 #define __NR_epoll_pwait2 441 #define __NR_mount_setattr 442 +#define __NR_quotactl_fd 443 #define __NR_landlock_create_ruleset 444 #define __NR_landlock_add_rule 445 #define __NR_landlock_restrict_self 446 diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h index 0662f64..a6c327f 100644 --- a/linux-headers/asm-x86/kvm.h +++ b/linux-headers/asm-x86/kvm.h @@ -159,6 +159,19 @@ struct kvm_sregs { __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; +struct kvm_sregs2 { + /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 flags; + __u64 pdptrs[4]; +}; +#define KVM_SREGS2_FLAGS_PDPTRS_VALID 1 + /* for KVM_GET_FPU and KVM_SET_FPU */ struct kvm_fpu { __u8 fpr[8][16]; diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h index 8f6ac8c..66e96c0 100644 --- a/linux-headers/asm-x86/unistd_32.h +++ b/linux-headers/asm-x86/unistd_32.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_UNISTD_32_H -#define _ASM_X86_UNISTD_32_H 1 +#ifndef _ASM_UNISTD_32_H +#define _ASM_UNISTD_32_H #define __NR_restart_syscall 0 #define __NR_exit 1 @@ -433,9 +433,10 @@ #define __NR_process_madvise 440 #define __NR_epoll_pwait2 441 #define __NR_mount_setattr 442 +#define __NR_quotactl_fd 443 #define __NR_landlock_create_ruleset 444 #define __NR_landlock_add_rule 445 #define __NR_landlock_restrict_self 446 -#endif /* _ASM_X86_UNISTD_32_H */ +#endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h index bb187a9..b8ff6f1 100644 --- a/linux-headers/asm-x86/unistd_64.h +++ b/linux-headers/asm-x86/unistd_64.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_UNISTD_64_H -#define _ASM_X86_UNISTD_64_H 1 +#ifndef _ASM_UNISTD_64_H +#define _ASM_UNISTD_64_H #define 
__NR_read 0 #define __NR_write 1 @@ -355,9 +355,10 @@ #define __NR_process_madvise 440 #define __NR_epoll_pwait2 441 #define __NR_mount_setattr 442 +#define __NR_quotactl_fd 443 #define __NR_landlock_create_ruleset 444 #define __NR_landlock_add_rule 445 #define __NR_landlock_restrict_self 446 -#endif /* _ASM_X86_UNISTD_64_H */ +#endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h index 4edd010..06a1097 100644 --- a/linux-headers/asm-x86/unistd_x32.h +++ b/linux-headers/asm-x86/unistd_x32.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_UNISTD_X32_H -#define _ASM_X86_UNISTD_X32_H 1 +#ifndef _ASM_UNISTD_X32_H +#define _ASM_UNISTD_X32_H #define __NR_read (__X32_SYSCALL_BIT + 0) #define __NR_write (__X32_SYSCALL_BIT + 1) @@ -308,6 +308,7 @@ #define __NR_process_madvise (__X32_SYSCALL_BIT + 440) #define __NR_epoll_pwait2 (__X32_SYSCALL_BIT + 441) #define __NR_mount_setattr (__X32_SYSCALL_BIT + 442) +#define __NR_quotactl_fd (__X32_SYSCALL_BIT + 443) #define __NR_landlock_create_ruleset (__X32_SYSCALL_BIT + 444) #define __NR_landlock_add_rule (__X32_SYSCALL_BIT + 445) #define __NR_landlock_restrict_self (__X32_SYSCALL_BIT + 446) @@ -349,4 +350,4 @@ #define __NR_pwritev2 (__X32_SYSCALL_BIT + 547) -#endif /* _ASM_X86_UNISTD_X32_H */ +#endif /* _ASM_UNISTD_X32_H */ diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 20d6a26..bcaf66c 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -280,6 +280,9 @@ struct kvm_xen_exit { /* Encounter unexpected vm-exit reason */ #define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON 4 +/* Flags that describe what fields in emulation_failure hold valid data. 
*/ +#define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0) + /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ @@ -383,6 +386,25 @@ struct kvm_run { __u32 ndata; __u64 data[16]; } internal; + /* + * KVM_INTERNAL_ERROR_EMULATION + * + * "struct emulation_failure" is an overlay of "struct internal" + * that is used for the KVM_INTERNAL_ERROR_EMULATION sub-type of + * KVM_EXIT_INTERNAL_ERROR. Note, unlike other internal error + * sub-types, this struct is ABI! It also needs to be backwards + * compatible with "struct internal". Take special care that + * "ndata" is correct, that new fields are enumerated in "flags", + * and that each flag enumerates fields that are 64-bit aligned + * and sized (so that ndata+internal.data[] is valid/accurate). + */ + struct { + __u32 suberror; + __u32 ndata; + __u64 flags; + __u8 insn_size; + __u8 insn_bytes[15]; + } emulation_failure; /* KVM_EXIT_OSI */ struct { __u64 gprs[32]; @@ -1083,6 +1105,13 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SGX_ATTRIBUTE 196 #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197 #define KVM_CAP_PTP_KVM 198 +#define KVM_CAP_HYPERV_ENFORCE_CPUID 199 +#define KVM_CAP_SREGS2 200 +#define KVM_CAP_EXIT_HYPERCALL 201 +#define KVM_CAP_PPC_RPT_INVALIDATE 202 +#define KVM_CAP_BINARY_STATS_FD 203 +#define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 +#define KVM_CAP_ARM_MTE 205 #ifdef KVM_CAP_IRQ_ROUTING @@ -1428,6 +1457,7 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_PMU_EVENT_FILTER */ #define KVM_SET_PMU_EVENT_FILTER _IOW(KVMIO, 0xb2, struct kvm_pmu_event_filter) #define KVM_PPC_SVM_OFF _IO(KVMIO, 0xb3) +#define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) @@ -1621,6 +1651,9 @@ struct kvm_xen_hvm_attr { #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) #define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct 
kvm_xen_vcpu_attr) +#define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) +#define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2) + struct kvm_xen_vcpu_attr { __u16 type; __u16 pad[3]; @@ -1899,4 +1932,76 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) +/** + * struct kvm_stats_header - Header of per vm/vcpu binary statistics data. + * @flags: Some extra information for header, always 0 for now. + * @name_size: The size in bytes of the memory which contains statistics + * name string including trailing '\0'. The memory is allocated + * at the end of statistics descriptor. + * @num_desc: The number of statistics the vm or vcpu has. + * @id_offset: The offset of the vm/vcpu stats' id string in the file pointed + * by vm/vcpu stats fd. + * @desc_offset: The offset of the vm/vcpu stats' descriptor block in the file + * pointed by vm/vcpu stats fd. + * @data_offset: The offset of the vm/vcpu stats' data block in the file + * pointed by vm/vcpu stats fd. + * + * This is the header userspace needs to read from stats fd before any other + * readings. It is used by userspace to discover all the information about the + * vm/vcpu's binary statistics. + * Userspace reads this header from the start of the vm/vcpu's stats fd. 
+ */ +struct kvm_stats_header { + __u32 flags; + __u32 name_size; + __u32 num_desc; + __u32 id_offset; + __u32 desc_offset; + __u32 data_offset; +}; + +#define KVM_STATS_TYPE_SHIFT 0 +#define KVM_STATS_TYPE_MASK (0xF << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_PEAK + +#define KVM_STATS_UNIT_SHIFT 4 +#define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_NONE (0x0 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES + +#define KVM_STATS_BASE_SHIFT 8 +#define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) +#define KVM_STATS_BASE_POW10 (0x0 << KVM_STATS_BASE_SHIFT) +#define KVM_STATS_BASE_POW2 (0x1 << KVM_STATS_BASE_SHIFT) +#define KVM_STATS_BASE_MAX KVM_STATS_BASE_POW2 + +/** + * struct kvm_stats_desc - Descriptor of a KVM statistics. + * @flags: Annotations of the stats, like type, unit, etc. + * @exponent: Used together with @flags to determine the unit. + * @size: The number of data items for this stats. + * Every data item is of type __u64. + * @offset: The offset of the stats to the start of stat structure in + * structure kvm or kvm_vcpu. + * @unused: Unused field for future usage. Always 0 for now. + * @name: The name string for the stats. Its size is indicated by the + * &kvm_stats_header->name_size. 
+ */ +struct kvm_stats_desc { + __u32 flags; + __s16 exponent; + __u16 size; + __u32 offset; + __u32 unused; + char name[]; +}; + +#define KVM_GET_STATS_FD _IO(KVMIO, 0xce) + #endif /* __LINUX_KVM_H */ diff --git a/linux-headers/linux/userfaultfd.h b/linux-headers/linux/userfaultfd.h index b9ac97b..8479af5 100644 --- a/linux-headers/linux/userfaultfd.h +++ b/linux-headers/linux/userfaultfd.h @@ -31,7 +31,8 @@ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ - UFFD_FEATURE_MINOR_HUGETLBFS) + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -80,8 +81,8 @@ struct uffdio_zeropage) #define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ struct uffdio_writeprotect) -#define UFFDIO_CONTINUE _IOR(UFFDIO, _UFFDIO_CONTINUE, \ - struct uffdio_continue) +#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) /* read() structure */ struct uffd_msg { @@ -185,6 +186,9 @@ struct uffdio_api { * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults * can be intercepted (via REGISTER_MODE_MINOR) for * hugetlbfs-backed pages. + * + * UFFD_FEATURE_MINOR_SHMEM indicates the same support as + * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -196,6 +200,7 @@ struct uffdio_api { #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) __u64 features; __u64 ioctls; diff --git a/pc-bios/README b/pc-bios/README index c101c9a..d344e3b 100644 --- a/pc-bios/README +++ b/pc-bios/README @@ -16,6 +16,10 @@ https://github.com/aik/SLOF, and the image currently in qemu is built from git tag qemu-slof-20210217. +- VOF (Virtual Open Firmware) is a minimalistic firmware to work with + -machine pseries,x-vof=on. 
When enabled, the firmware acts as a slim shim and + QEMU implements parts of the IEEE 1275 Open Firmware interface. + - sgabios (the Serial Graphics Adapter option ROM) provides a means for legacy x86 software to communicate with an attached serial console as if a video card were attached. The master sources reside in a subversion diff --git a/pc-bios/u-boot.e500 b/pc-bios/u-boot.e500 Binary files differ index d2e29f8..8e635c8 100644 --- a/pc-bios/u-boot.e500 +++ b/pc-bios/u-boot.e500 diff --git a/pc-bios/vof-nvram.bin b/pc-bios/vof-nvram.bin Binary files differ new file mode 100644 index 0000000..d183901 --- /dev/null +++ b/pc-bios/vof-nvram.bin diff --git a/pc-bios/vof.bin b/pc-bios/vof.bin Binary files differ new file mode 100755 index 0000000..300cb7c --- /dev/null +++ b/pc-bios/vof.bin diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile new file mode 100644 index 0000000..aa1678c --- /dev/null +++ b/pc-bios/vof/Makefile @@ -0,0 +1,23 @@ +all: build-all + +build-all: vof.bin + +CROSS ?= +CC = $(CROSS)gcc +LD = $(CROSS)ld +OBJCOPY = $(CROSS)objcopy + +%.o: %.S + $(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $< + +%.o: %.c + $(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o $@ $< + +vof.elf: entry.o main.o ci.o bootmem.o libc.o + $(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^ + +%.bin: %.elf + $(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@ + +clean: + rm -f *.o vof.bin vof.elf *~ diff --git a/pc-bios/vof/bootmem.c b/pc-bios/vof/bootmem.c new file mode 100644 index 0000000..771b9e9 --- /dev/null +++ b/pc-bios/vof/bootmem.c @@ -0,0 +1,14 @@ +#include "vof.h" + +void boot_from_memory(uint64_t initrd, uint64_t initrdsize) +{ + uint64_t kern[2]; + phandle chosen = ci_finddevice("/chosen"); + + if (ci_getprop(chosen, "qemu,boot-kernel", kern, sizeof(kern)) != + sizeof(kern)) { + return; + } + + do_boot(kern[0], initrd, initrdsize); +} diff --git a/pc-bios/vof/ci.c b/pc-bios/vof/ci.c new file mode 100644 index 0000000..fc4821b --- 
/dev/null +++ b/pc-bios/vof/ci.c @@ -0,0 +1,91 @@ +#include "vof.h" + +struct prom_args { + uint32_t service; + uint32_t nargs; + uint32_t nret; + uint32_t args[10]; +}; + +typedef unsigned long prom_arg_t; + +#define ADDR(x) (uint32_t)(x) + +static int prom_handle(struct prom_args *pargs) +{ + void *rtasbase; + uint32_t rtassize = 0; + phandle rtas; + + if (strcmp("call-method", (void *)(unsigned long)pargs->service)) { + return -1; + } + + if (strcmp("instantiate-rtas", (void *)(unsigned long)pargs->args[0])) { + return -1; + } + + rtas = ci_finddevice("/rtas"); + /* rtas-size is set by QEMU depending on FWNMI support */ + ci_getprop(rtas, "rtas-size", &rtassize, sizeof(rtassize)); + if (rtassize < hv_rtas_size) { + return -1; + } + + rtasbase = (void *)(unsigned long) pargs->args[2]; + + memcpy(rtasbase, hv_rtas, hv_rtas_size); + pargs->args[pargs->nargs] = 0; + pargs->args[pargs->nargs + 1] = pargs->args[2]; + + return 0; +} + +void prom_entry(uint32_t args) +{ + if (prom_handle((void *)(unsigned long) args)) { + ci_entry(args); + } +} + +static int call_ci(const char *service, int nargs, int nret, ...) +{ + int i; + struct prom_args args; + va_list list; + + args.service = ADDR(service); + args.nargs = nargs; + args.nret = nret; + + va_start(list, nret); + for (i = 0; i < nargs; i++) { + args.args[i] = va_arg(list, prom_arg_t); + } + va_end(list); + + for (i = 0; i < nret; i++) { + args.args[nargs + i] = 0; + } + + if (ci_entry((uint32_t)(&args)) < 0) { + return -1; + } + + return (nret > 0) ? 
args.args[nargs] : 0; +} + +void ci_panic(const char *str) +{ + call_ci("exit", 0, 0); +} + +phandle ci_finddevice(const char *path) +{ + return call_ci("finddevice", 1, 1, path); +} + +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len) +{ + return call_ci("getprop", 4, 1, ph, propname, prop, len); +} diff --git a/pc-bios/vof/entry.S b/pc-bios/vof/entry.S new file mode 100644 index 0000000..10a101f --- /dev/null +++ b/pc-bios/vof/entry.S @@ -0,0 +1,49 @@ +#define LOAD32(rn, name) \ + lis rn,name##@h; \ + ori rn,rn,name##@l + +#define ENTRY(func_name) \ + .text; \ + .align 2; \ + .globl .func_name; \ + .func_name: \ + .globl func_name; \ + func_name: + +#define KVMPPC_HCALL_BASE 0xf000 +#define KVMPPC_H_RTAS (KVMPPC_HCALL_BASE + 0x0) +#define KVMPPC_H_VOF_CLIENT (KVMPPC_HCALL_BASE + 0x5) + + . = 0x100 /* Do exactly as SLOF does */ + +ENTRY(_start) + LOAD32(2, __toc_start) + b entry_c + +ENTRY(_prom_entry) + LOAD32(2, __toc_start) + stwu %r1,-112(%r1) + stw %r31,104(%r1) + mflr %r31 + bl prom_entry + nop + mtlr %r31 + lwz %r31,104(%r1) + addi %r1,%r1,112 + blr + +ENTRY(ci_entry) + mr 4,3 + LOAD32(3,KVMPPC_H_VOF_CLIENT) + sc 1 + blr + +/* This is the actual RTAS blob copied to the OS at instantiate-rtas */ +ENTRY(hv_rtas) + mr %r4,%r3 + LOAD32(3,KVMPPC_H_RTAS) + sc 1 + blr + .globl hv_rtas_size +hv_rtas_size: + .long . 
- hv_rtas; diff --git a/pc-bios/vof/libc.c b/pc-bios/vof/libc.c new file mode 100644 index 0000000..fdbc30f --- /dev/null +++ b/pc-bios/vof/libc.c @@ -0,0 +1,66 @@ +#include "vof.h" + +int strlen(const char *s) +{ + int len = 0; + + while (*s != 0) { + len += 1; + s += 1; + } + + return len; +} + +int strcmp(const char *s1, const char *s2) +{ + while (*s1 != 0 && *s2 != 0) { + if (*s1 != *s2) { + break; + } + s1 += 1; + s2 += 1; + } + + return *s1 - *s2; +} + +void *memcpy(void *dest, const void *src, size_t n) +{ + char *cdest; + const char *csrc = src; + + cdest = dest; + while (n-- > 0) { + *cdest++ = *csrc++; + } + + return dest; +} + +int memcmp(const void *ptr1, const void *ptr2, size_t n) +{ + const unsigned char *p1 = ptr1; + const unsigned char *p2 = ptr2; + + while (n-- > 0) { + if (*p1 != *p2) { + return *p1 - *p2; + } + p1 += 1; + p2 += 1; + } + + return 0; +} + +void *memset(void *dest, int c, size_t size) +{ + unsigned char *d = (unsigned char *)dest; + + while (size-- > 0) { + *d++ = (unsigned char)c; + } + + return dest; +} diff --git a/pc-bios/vof/main.c b/pc-bios/vof/main.c new file mode 100644 index 0000000..0f0f6b4 --- /dev/null +++ b/pc-bios/vof/main.c @@ -0,0 +1,21 @@ +#include "vof.h" + +void do_boot(unsigned long addr, unsigned long _r3, unsigned long _r4) +{ + register unsigned long r3 __asm__("r3") = _r3; + register unsigned long r4 __asm__("r4") = _r4; + register unsigned long r5 __asm__("r5") = (unsigned long) _prom_entry; + + ((void (*)(void))(uint32_t)addr)(); +} + +void entry_c(void) +{ + register unsigned long r3 __asm__("r3"); + register unsigned long r4 __asm__("r4"); + register unsigned long r5 __asm__("r5"); + uint64_t initrd = r3, initrdsize = r4; + + boot_from_memory(initrd, initrdsize); + ci_panic("*** No boot target ***\n"); +} diff --git a/pc-bios/vof/vof.h b/pc-bios/vof/vof.h new file mode 100644 index 0000000..5f12c07 --- /dev/null +++ b/pc-bios/vof/vof.h @@ -0,0 +1,41 @@ +/* + * Virtual Open Firmware + * + * 
SPDX-License-Identifier: GPL-2.0-or-later + */ +#include <stdarg.h> + +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned long uint32_t; +typedef unsigned long long uint64_t; +#define NULL (0) +typedef unsigned long ihandle; +typedef unsigned long phandle; +typedef int size_t; + +/* globals */ +extern void _prom_entry(void); /* OF CI entry point (i.e. this firmware) */ + +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4); + +/* libc */ +int strlen(const char *s); +int strcmp(const char *s1, const char *s2); +void *memcpy(void *dest, const void *src, size_t n); +int memcmp(const void *ptr1, const void *ptr2, size_t n); +void *memmove(void *dest, const void *src, size_t n); +void *memset(void *dest, int c, size_t size); + +/* CI wrappers */ +void ci_panic(const char *str); +phandle ci_finddevice(const char *path); +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len); + +/* booting from -kernel */ +void boot_from_memory(uint64_t initrd, uint64_t initrdsize); + +/* Entry points for CI and RTAS */ +extern uint32_t ci_entry(uint32_t params); +extern unsigned long hv_rtas(unsigned long params); +extern unsigned int hv_rtas_size; diff --git a/pc-bios/vof/vof.lds b/pc-bios/vof/vof.lds new file mode 100644 index 0000000..1506ab4 --- /dev/null +++ b/pc-bios/vof/vof.lds @@ -0,0 +1,48 @@ +OUTPUT_FORMAT("elf32-powerpc") +OUTPUT_ARCH(powerpc:common) + +/* set the entry point */ +ENTRY ( __start ) + +SECTIONS { + __executable_start = .; + + .text : { + *(.text) + } + + __etext = .; + + . = ALIGN(8); + + .data : { + *(.data) + *(.rodata .rodata.*) + *(.got1) + *(.sdata) + *(.opd) + } + + /* FIXME bss at end ??? */ + + . = ALIGN(8); + __bss_start = .; + .bss : { + *(.sbss) *(.scommon) + *(.dynbss) + *(.bss) + } + + . = ALIGN(8); + __bss_end = .; + __bss_size = (__bss_end - __bss_start); + + . = ALIGN(256); + __toc_start = DEFINED (.TOC.) ? .TOC. : ADDR (.got) + 0x8000; + .got : + { + *(.toc .got) + } + . 
= ALIGN(8); + __toc_end = .; +} diff --git a/roms/u-boot b/roms/u-boot -Subproject b46dd116ce03e235f2a7d4843c6278e1da44b5e +Subproject 840658b093976390e9537724f802281c9c8439f diff --git a/softmmu/memory.c b/softmmu/memory.c index f016151..cea2f62 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -2027,6 +2027,70 @@ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr) return imrc->num_indexes(iommu_mr); } +RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr) +{ + if (!memory_region_is_mapped(mr) || !memory_region_is_ram(mr)) { + return NULL; + } + return mr->rdm; +} + +void memory_region_set_ram_discard_manager(MemoryRegion *mr, + RamDiscardManager *rdm) +{ + g_assert(memory_region_is_ram(mr) && !memory_region_is_mapped(mr)); + g_assert(!rdm || !mr->rdm); + mr->rdm = rdm; +} + +uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm, + const MemoryRegion *mr) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->get_min_granularity); + return rdmc->get_min_granularity(rdm, mr); +} + +bool ram_discard_manager_is_populated(const RamDiscardManager *rdm, + const MemoryRegionSection *section) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->is_populated); + return rdmc->is_populated(rdm, section); +} + +int ram_discard_manager_replay_populated(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamPopulate replay_fn, + void *opaque) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->replay_populated); + return rdmc->replay_populated(rdm, section, replay_fn, opaque); +} + +void ram_discard_manager_register_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->register_listener); + rdmc->register_listener(rdm, rdl, section); +} + +void 
ram_discard_manager_unregister_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->unregister_listener); + rdmc->unregister_listener(rdm, rdl); +} + void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) { uint8_t mask = 1 << client; @@ -2637,6 +2701,33 @@ MemoryRegionSection memory_region_find(MemoryRegion *mr, return ret; } +MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s) +{ + MemoryRegionSection *tmp = g_new(MemoryRegionSection, 1); + + *tmp = *s; + if (tmp->mr) { + memory_region_ref(tmp->mr); + } + if (tmp->fv) { + bool ret = flatview_ref(tmp->fv); + + g_assert(ret); + } + return tmp; +} + +void memory_region_section_free_copy(MemoryRegionSection *s) +{ + if (s->fv) { + flatview_unref(s->fv); + } + if (s->mr) { + memory_region_unref(s->mr); + } + g_free(s); +} + bool memory_region_present(MemoryRegion *container, hwaddr addr) { MemoryRegion *mr; @@ -3320,10 +3411,17 @@ static const TypeInfo iommu_memory_region_info = { .abstract = true, }; +static const TypeInfo ram_discard_manager_info = { + .parent = TYPE_INTERFACE, + .name = TYPE_RAM_DISCARD_MANAGER, + .class_size = sizeof(RamDiscardManagerClass), +}; + static void memory_register_types(void) { type_register_static(&memory_region_info); type_register_static(&iommu_memory_region_info); + type_register_static(&ram_discard_manager_info); } type_init(memory_register_types) diff --git a/softmmu/physmem.c b/softmmu/physmem.c index 9b171c9..3c1912a 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -3684,56 +3684,106 @@ void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root) } } -/* - * If positive, discarding RAM is disabled. If negative, discarding RAM is - * required to work and cannot be disabled. - */ -static int ram_block_discard_disabled; +/* Require any discards to work. 
*/ +static unsigned int ram_block_discard_required_cnt; +/* Require only coordinated discards to work. */ +static unsigned int ram_block_coordinated_discard_required_cnt; +/* Disable any discards. */ +static unsigned int ram_block_discard_disabled_cnt; +/* Disable only uncoordinated discards. */ +static unsigned int ram_block_uncoordinated_discard_disabled_cnt; +static QemuMutex ram_block_discard_disable_mutex; + +static void ram_block_discard_disable_mutex_lock(void) +{ + static gsize initialized; + + if (g_once_init_enter(&initialized)) { + qemu_mutex_init(&ram_block_discard_disable_mutex); + g_once_init_leave(&initialized, 1); + } + qemu_mutex_lock(&ram_block_discard_disable_mutex); +} + +static void ram_block_discard_disable_mutex_unlock(void) +{ + qemu_mutex_unlock(&ram_block_discard_disable_mutex); +} int ram_block_discard_disable(bool state) { - int old; + int ret = 0; + ram_block_discard_disable_mutex_lock(); if (!state) { - qatomic_dec(&ram_block_discard_disabled); - return 0; + ram_block_discard_disabled_cnt--; + } else if (ram_block_discard_required_cnt || + ram_block_coordinated_discard_required_cnt) { + ret = -EBUSY; + } else { + ram_block_discard_disabled_cnt++; } + ram_block_discard_disable_mutex_unlock(); + return ret; +} - do { - old = qatomic_read(&ram_block_discard_disabled); - if (old < 0) { - return -EBUSY; - } - } while (qatomic_cmpxchg(&ram_block_discard_disabled, - old, old + 1) != old); - return 0; +int ram_block_uncoordinated_discard_disable(bool state) +{ + int ret = 0; + + ram_block_discard_disable_mutex_lock(); + if (!state) { + ram_block_uncoordinated_discard_disabled_cnt--; + } else if (ram_block_discard_required_cnt) { + ret = -EBUSY; + } else { + ram_block_uncoordinated_discard_disabled_cnt++; + } + ram_block_discard_disable_mutex_unlock(); + return ret; } int ram_block_discard_require(bool state) { - int old; + int ret = 0; + ram_block_discard_disable_mutex_lock(); if (!state) { - qatomic_inc(&ram_block_discard_disabled); - return 
0; + ram_block_discard_required_cnt--; + } else if (ram_block_discard_disabled_cnt || + ram_block_uncoordinated_discard_disabled_cnt) { + ret = -EBUSY; + } else { + ram_block_discard_required_cnt++; } + ram_block_discard_disable_mutex_unlock(); + return ret; +} - do { - old = qatomic_read(&ram_block_discard_disabled); - if (old > 0) { - return -EBUSY; - } - } while (qatomic_cmpxchg(&ram_block_discard_disabled, - old, old - 1) != old); - return 0; +int ram_block_coordinated_discard_require(bool state) +{ + int ret = 0; + + ram_block_discard_disable_mutex_lock(); + if (!state) { + ram_block_coordinated_discard_required_cnt--; + } else if (ram_block_discard_disabled_cnt) { + ret = -EBUSY; + } else { + ram_block_coordinated_discard_required_cnt++; + } + ram_block_discard_disable_mutex_unlock(); + return ret; } bool ram_block_discard_is_disabled(void) { - return qatomic_read(&ram_block_discard_disabled) > 0; + return qatomic_read(&ram_block_discard_disabled_cnt) || + qatomic_read(&ram_block_uncoordinated_discard_disabled_cnt); } bool ram_block_discard_is_required(void) { - return qatomic_read(&ram_block_discard_disabled) < 0; + return qatomic_read(&ram_block_discard_required_cnt) || + qatomic_read(&ram_block_coordinated_discard_required_cnt); } diff --git a/target/ppc/arch_dump.c b/target/ppc/arch_dump.c index 9210e61..bb392f6 100644 --- a/target/ppc/arch_dump.c +++ b/target/ppc/arch_dump.c @@ -227,22 +227,20 @@ int cpu_get_dump_info(ArchDumpInfo *info, const struct GuestPhysBlockList *guest_phys_blocks) { PowerPCCPU *cpu; - PowerPCCPUClass *pcc; if (first_cpu == NULL) { return -1; } cpu = POWERPC_CPU(first_cpu); - pcc = POWERPC_CPU_GET_CLASS(cpu); info->d_machine = PPC_ELF_MACHINE; info->d_class = ELFCLASS; - if ((*pcc->interrupts_big_endian)(cpu)) { - info->d_endian = ELFDATA2MSB; - } else { + if (ppc_interrupts_little_endian(cpu)) { info->d_endian = ELFDATA2LSB; + } else { + info->d_endian = ELFDATA2MSB; } /* 64KB is the max page size for pseries kernel */ if 
(strncmp(object_get_typename(qdev_get_machine()), diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h index 06b6571..5800fa3 100644 --- a/target/ppc/cpu-qom.h +++ b/target/ppc/cpu-qom.h @@ -198,8 +198,6 @@ struct PowerPCCPUClass { int n_host_threads; void (*init_proc)(CPUPPCState *env); int (*check_pow)(CPUPPCState *env); - int (*handle_mmu_fault)(PowerPCCPU *cpu, vaddr eaddr, int rwx, int mmu_idx); - bool (*interrupts_big_endian)(PowerPCCPU *cpu); }; #ifndef CONFIG_USER_ONLY diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c index 19d67b5..a292998 100644 --- a/target/ppc/cpu.c +++ b/target/ppc/cpu.c @@ -72,7 +72,7 @@ void ppc_store_sdr1(CPUPPCState *env, target_ulong value) { PowerPCCPU *cpu = env_archcpu(env); qemu_log_mask(CPU_LOG_MMU, "%s: " TARGET_FMT_lx "\n", __func__, value); - assert(!cpu->vhyp); + assert(!cpu->env.has_hv_mode || !cpu->vhyp); #if defined(TARGET_PPC64) if (mmu_is_64bit(env->mmu_model)) { target_ulong sdr_mask = SDR_64_HTABORG | SDR_64_HTABSIZE; diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index b4de0db..93d308a 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -2643,6 +2643,21 @@ static inline bool ppc_has_spr(PowerPCCPU *cpu, int spr) return cpu->env.spr_cb[spr].name != NULL; } +static inline bool ppc_interrupts_little_endian(PowerPCCPU *cpu) +{ + PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); + + /* + * Only models that have an LPCR and know about LPCR_ILE can do little + * endian. 
+ */ + if (pcc->lpcr_mask & LPCR_ILE) { + return !!(cpu->env.spr[SPR_LPCR] & LPCR_ILE); + } + + return false; +} + void dump_mmu(CPUPPCState *env); void ppc_maybe_bswap_register(CPUPPCState *env, uint8_t *mem_buf, int len); diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index d0411e7..505a0ed 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -2666,18 +2666,6 @@ static int check_pow_hid0_74xx(CPUPPCState *env) return 0; } -static bool ppc_cpu_interrupts_big_endian_always(PowerPCCPU *cpu) -{ - return true; -} - -#ifdef TARGET_PPC64 -static bool ppc_cpu_interrupts_big_endian_lpcr(PowerPCCPU *cpu) -{ - return !(cpu->env.spr[SPR_LPCR] & LPCR_ILE); -} -#endif - /*****************************************************************************/ /* PowerPC implementations definitions */ @@ -4578,9 +4566,6 @@ POWERPC_FAMILY(601)(ObjectClass *oc, void *data) (1ull << MSR_IR) | (1ull << MSR_DR); pcc->mmu_model = POWERPC_MMU_601; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_601; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_601; @@ -4623,9 +4608,6 @@ POWERPC_FAMILY(601v)(ObjectClass *oc, void *data) (1ull << MSR_IR) | (1ull << MSR_DR); pcc->mmu_model = POWERPC_MMU_601; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_601; pcc->flags = POWERPC_FLAG_SE | POWERPC_FLAG_RTC_CLK | POWERPC_FLAG_HID0_LE; @@ -4889,9 +4871,6 @@ POWERPC_FAMILY(604)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_604; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_604; @@ -4973,9 +4952,6 @@ POWERPC_FAMILY(604E)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); 
pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_604; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_604; @@ -5044,9 +5020,6 @@ POWERPC_FAMILY(740)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_7x0; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_750; @@ -5124,9 +5097,6 @@ POWERPC_FAMILY(750)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_7x0; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_750; @@ -5327,9 +5297,6 @@ POWERPC_FAMILY(750cl)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_7x0; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_750; @@ -5410,9 +5377,6 @@ POWERPC_FAMILY(750cx)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_7x0; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_750; @@ -5498,9 +5462,6 @@ POWERPC_FAMILY(750fx)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_7x0; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_750; @@ -5586,9 +5547,6 @@ 
POWERPC_FAMILY(750gx)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_7x0; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_750; @@ -5828,9 +5786,6 @@ POWERPC_FAMILY(7400)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_74xx; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_7400; @@ -5914,9 +5869,6 @@ POWERPC_FAMILY(7410)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_74xx; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_7400; @@ -6743,9 +6695,6 @@ POWERPC_FAMILY(e600)(ObjectClass *oc, void *data) (1ull << MSR_RI) | (1ull << MSR_LE); pcc->mmu_model = POWERPC_MMU_32B; -#if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash32_handle_mmu_fault; -#endif pcc->excp_model = POWERPC_EXCP_74xx; pcc->bus_model = PPC_FLAGS_INPUT_6xx; pcc->bfd_mach = bfd_mach_ppc_7400; @@ -7505,7 +7454,6 @@ POWERPC_FAMILY(970)(ObjectClass *oc, void *data) (1ull << MSR_RI); pcc->mmu_model = POWERPC_MMU_64B; #if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash64_handle_mmu_fault; pcc->hash64_opts = &ppc_hash64_opts_basic; #endif pcc->excp_model = POWERPC_EXCP_970; @@ -7583,7 +7531,6 @@ POWERPC_FAMILY(POWER5P)(ObjectClass *oc, void *data) LPCR_RMI | LPCR_HDICE; pcc->mmu_model = POWERPC_MMU_2_03; #if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash64_handle_mmu_fault; pcc->hash64_opts = &ppc_hash64_opts_basic; pcc->lrg_decr_bits = 32; #endif @@ -7727,7 +7674,6 @@ POWERPC_FAMILY(POWER7)(ObjectClass 
*oc, void *data) pcc->lpcr_pm = LPCR_P7_PECE0 | LPCR_P7_PECE1 | LPCR_P7_PECE2; pcc->mmu_model = POWERPC_MMU_2_06; #if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash64_handle_mmu_fault; pcc->hash64_opts = &ppc_hash64_opts_POWER7; pcc->lrg_decr_bits = 32; #endif @@ -7740,7 +7686,6 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data) POWERPC_FLAG_VSX; pcc->l1_dcache_size = 0x8000; pcc->l1_icache_size = 0x8000; - pcc->interrupts_big_endian = ppc_cpu_interrupts_big_endian_lpcr; } static void init_proc_POWER8(CPUPPCState *env) @@ -7904,7 +7849,6 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data) LPCR_P8_PECE3 | LPCR_P8_PECE4; pcc->mmu_model = POWERPC_MMU_2_07; #if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc_hash64_handle_mmu_fault; pcc->hash64_opts = &ppc_hash64_opts_POWER7; pcc->lrg_decr_bits = 32; pcc->n_host_threads = 8; @@ -7918,7 +7862,6 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data) POWERPC_FLAG_VSX | POWERPC_FLAG_TM; pcc->l1_dcache_size = 0x8000; pcc->l1_icache_size = 0x8000; - pcc->interrupts_big_endian = ppc_cpu_interrupts_big_endian_lpcr; } #ifdef CONFIG_SOFTMMU @@ -8120,7 +8063,6 @@ POWERPC_FAMILY(POWER9)(ObjectClass *oc, void *data) pcc->lpcr_pm = LPCR_PDEE | LPCR_HDEE | LPCR_EEE | LPCR_DEE | LPCR_OEE; pcc->mmu_model = POWERPC_MMU_3_00; #if defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc64_v3_handle_mmu_fault; /* segment page size remain the same */ pcc->hash64_opts = &ppc_hash64_opts_POWER7; pcc->radix_page_info = &POWER9_radix_page_info; @@ -8136,7 +8078,6 @@ POWERPC_FAMILY(POWER9)(ObjectClass *oc, void *data) POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV; pcc->l1_dcache_size = 0x8000; pcc->l1_icache_size = 0x8000; - pcc->interrupts_big_endian = ppc_cpu_interrupts_big_endian_lpcr; } #ifdef CONFIG_SOFTMMU @@ -8332,7 +8273,6 @@ POWERPC_FAMILY(POWER10)(ObjectClass *oc, void *data) pcc->lpcr_pm = LPCR_PDEE | LPCR_HDEE | LPCR_EEE | LPCR_DEE | LPCR_OEE; pcc->mmu_model = POWERPC_MMU_3_00; #if 
defined(CONFIG_SOFTMMU) - pcc->handle_mmu_fault = ppc64_v3_handle_mmu_fault; /* segment page size remain the same */ pcc->hash64_opts = &ppc_hash64_opts_POWER7; pcc->radix_page_info = &POWER10_radix_page_info; @@ -8347,7 +8287,6 @@ POWERPC_FAMILY(POWER10)(ObjectClass *oc, void *data) POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV; pcc->l1_dcache_size = 0x8000; pcc->l1_icache_size = 0x8000; - pcc->interrupts_big_endian = ppc_cpu_interrupts_big_endian_lpcr; } #if !defined(CONFIG_USER_ONLY) @@ -8908,9 +8847,11 @@ static void ppc_cpu_reset(DeviceState *dev) #if !defined(CONFIG_USER_ONLY) env->nip = env->hreset_vector | env->excp_prefix; +#if defined(CONFIG_TCG) if (env->mmu_model != POWERPC_MMU_REAL) { ppc_tlb_invalidate_all(env); } +#endif /* CONFIG_TCG */ #endif hreg_compute_hflags(env); @@ -9094,7 +9035,6 @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data) device_class_set_parent_unrealize(dc, ppc_cpu_unrealize, &pcc->parent_unrealize); pcc->pvr_match = ppc_pvr_match_default; - pcc->interrupts_big_endian = ppc_cpu_interrupts_big_endian_always; device_class_set_props(dc, ppc_cpu_properties); device_class_set_parent_reset(dc, ppc_cpu_reset, &pcc->parent_reset); diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c index fd147e2..a79a0ed 100644 --- a/target/ppc/excp_helper.c +++ b/target/ppc/excp_helper.c @@ -1099,7 +1099,6 @@ void ppc_cpu_do_fwnmi_machine_check(CPUState *cs, target_ulong vector) { PowerPCCPU *cpu = POWERPC_CPU(cs); CPUPPCState *env = &cpu->env; - PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); target_ulong msr = 0; /* @@ -1108,7 +1107,7 @@ void ppc_cpu_do_fwnmi_machine_check(CPUState *cs, target_ulong vector) */ msr = (1ULL << MSR_ME); msr |= env->msr & (1ULL << MSR_SF); - if (!(*pcc->interrupts_big_endian)(cpu)) { + if (ppc_interrupts_little_endian(cpu)) { msr |= (1ULL << MSR_LE); } diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index 104a308..dc93b99 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -89,6 
+89,7 @@ static int cap_ppc_count_cache_flush_assist; static int cap_ppc_nested_kvm_hv; static int cap_large_decr; static int cap_fwnmi; +static int cap_rpt_invalidate; static uint32_t debug_inst_opcode; @@ -152,6 +153,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) exit(1); } + cap_rpt_invalidate = kvm_vm_check_extension(s, KVM_CAP_PPC_RPT_INVALIDATE); kvm_ppc_register_host_cpu_type(); return 0; @@ -2040,6 +2042,11 @@ void kvmppc_enable_h_page_init(void) kvmppc_enable_hcall(kvm_state, H_PAGE_INIT); } +void kvmppc_enable_h_rpt_invalidate(void) +{ + kvmppc_enable_hcall(kvm_state, H_RPT_INVALIDATE); +} + void kvmppc_set_papr(PowerPCCPU *cpu) { CPUState *cs = CPU(cpu); @@ -2551,6 +2558,11 @@ int kvmppc_enable_cap_large_decr(PowerPCCPU *cpu, int enable) return 0; } +int kvmppc_has_cap_rpt_invalidate(void) +{ + return cap_rpt_invalidate; +} + PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void) { uint32_t host_pvr = mfpvr(); diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h index 989f61a..ee9325b 100644 --- a/target/ppc/kvm_ppc.h +++ b/target/ppc/kvm_ppc.h @@ -24,6 +24,7 @@ void kvmppc_enable_logical_ci_hcalls(void); void kvmppc_enable_set_mode_hcall(void); void kvmppc_enable_clear_ref_mod_hcalls(void); void kvmppc_enable_h_page_init(void); +void kvmppc_enable_h_rpt_invalidate(void); void kvmppc_set_papr(PowerPCCPU *cpu); int kvmppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr); void kvmppc_set_mpic_proxy(PowerPCCPU *cpu, int mpic_proxy); @@ -71,6 +72,7 @@ bool kvmppc_has_cap_nested_kvm_hv(void); int kvmppc_set_cap_nested_kvm_hv(int enable); int kvmppc_get_cap_large_decr(void); int kvmppc_enable_cap_large_decr(PowerPCCPU *cpu, int enable); +int kvmppc_has_cap_rpt_invalidate(void); int kvmppc_enable_hwrng(void); int kvmppc_put_books_sregs(PowerPCCPU *cpu); PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void); @@ -150,6 +152,11 @@ static inline void kvmppc_enable_h_page_init(void) { } +static inline void kvmppc_enable_h_rpt_invalidate(void) +{ + 
g_assert_not_reached(); +} + static inline void kvmppc_set_papr(PowerPCCPU *cpu) { } @@ -381,6 +388,11 @@ static inline int kvmppc_enable_cap_large_decr(PowerPCCPU *cpu, int enable) return -1; } +static inline int kvmppc_has_cap_rpt_invalidate(void) +{ + return false; +} + static inline int kvmppc_enable_hwrng(void) { return -1; diff --git a/target/ppc/mmu-book3s-v3.c b/target/ppc/mmu-book3s-v3.c index c78fd8d..f4985ba 100644 --- a/target/ppc/mmu-book3s-v3.c +++ b/target/ppc/mmu-book3s-v3.c @@ -23,25 +23,6 @@ #include "mmu-book3s-v3.h" #include "mmu-radix64.h" -int ppc64_v3_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx, - int mmu_idx) -{ - if (ppc64_v3_radix(cpu)) { /* Guest uses radix */ - return ppc_radix64_handle_mmu_fault(cpu, eaddr, rwx, mmu_idx); - } else { /* Guest uses hash */ - return ppc_hash64_handle_mmu_fault(cpu, eaddr, rwx, mmu_idx); - } -} - -hwaddr ppc64_v3_get_phys_page_debug(PowerPCCPU *cpu, vaddr eaddr) -{ - if (ppc64_v3_radix(cpu)) { - return ppc_radix64_get_phys_page_debug(cpu, eaddr); - } else { - return ppc_hash64_get_phys_page_debug(cpu, eaddr); - } -} - bool ppc64_v3_get_pate(PowerPCCPU *cpu, target_ulong lpid, ppc_v3_pate_t *entry) { uint64_t patb = cpu->env.spr[SPR_PTCR] & PTCR_PATB; diff --git a/target/ppc/mmu-book3s-v3.h b/target/ppc/mmu-book3s-v3.h index 7b89be5..d6d5ed8 100644 --- a/target/ppc/mmu-book3s-v3.h +++ b/target/ppc/mmu-book3s-v3.h @@ -21,6 +21,7 @@ #define PPC_MMU_BOOK3S_V3_H #include "mmu-hash64.h" +#include "mmu-books.h" #ifndef CONFIG_USER_ONLY @@ -67,11 +68,6 @@ static inline bool ppc64_v3_radix(PowerPCCPU *cpu) return !!(cpu->env.spr[SPR_LPCR] & LPCR_HR); } -hwaddr ppc64_v3_get_phys_page_debug(PowerPCCPU *cpu, vaddr eaddr); - -int ppc64_v3_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx, - int mmu_idx); - static inline hwaddr ppc_hash64_hpt_base(PowerPCCPU *cpu) { uint64_t base; diff --git a/target/ppc/mmu-books.h b/target/ppc/mmu-books.h new file mode 100644 index 0000000..0d12551 --- /dev/null +++ 
b/target/ppc/mmu-books.h @@ -0,0 +1,30 @@ +/* + * PowerPC BookS emulation generic mmu definitions for qemu. + * + * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef PPC_MMU_BOOKS_H +#define PPC_MMU_BOOKS_H + +/* + * These correspond to the mmu_idx values computed in + * hreg_compute_hflags_value. See the tables therein + */ +static inline bool mmuidx_pr(int idx) { return !(idx & 1); } +static inline bool mmuidx_real(int idx) { return idx & 2; } +static inline bool mmuidx_hv(int idx) { return idx & 4; } +#endif /* PPC_MMU_BOOKS_H */ diff --git a/target/ppc/mmu-hash32.c b/target/ppc/mmu-hash32.c index 9f0a497..3957aab 100644 --- a/target/ppc/mmu-hash32.c +++ b/target/ppc/mmu-hash32.c @@ -25,9 +25,10 @@ #include "kvm_ppc.h" #include "internal.h" #include "mmu-hash32.h" +#include "mmu-books.h" #include "exec/log.h" -/* #define DEBUG_BAT */ +/* #define DEBUG_BATS */ #ifdef DEBUG_BATS # define LOG_BATS(...) qemu_log_mask(CPU_LOG_MMU, __VA_ARGS__) @@ -86,25 +87,22 @@ static int ppc_hash32_pp_prot(int key, int pp, int nx) return prot; } -static int ppc_hash32_pte_prot(PowerPCCPU *cpu, +static int ppc_hash32_pte_prot(int mmu_idx, target_ulong sr, ppc_hash_pte32_t pte) { - CPUPPCState *env = &cpu->env; unsigned pp, key; - key = !!(msr_pr ? 
(sr & SR32_KP) : (sr & SR32_KS)); + key = !!(mmuidx_pr(mmu_idx) ? (sr & SR32_KP) : (sr & SR32_KS)); pp = pte.pte1 & HPTE32_R_PP; return ppc_hash32_pp_prot(key, pp, !!(sr & SR32_NX)); } -static target_ulong hash32_bat_size(PowerPCCPU *cpu, +static target_ulong hash32_bat_size(int mmu_idx, target_ulong batu, target_ulong batl) { - CPUPPCState *env = &cpu->env; - - if ((msr_pr && !(batu & BATU32_VP)) - || (!msr_pr && !(batu & BATU32_VS))) { + if ((mmuidx_pr(mmu_idx) && !(batu & BATU32_VP)) + || (!mmuidx_pr(mmu_idx) && !(batu & BATU32_VS))) { return 0; } @@ -137,14 +135,13 @@ static target_ulong hash32_bat_601_size(PowerPCCPU *cpu, return BATU32_BEPI & ~((batl & BATL32_601_BL) << 17); } -static int hash32_bat_601_prot(PowerPCCPU *cpu, +static int hash32_bat_601_prot(int mmu_idx, target_ulong batu, target_ulong batl) { - CPUPPCState *env = &cpu->env; int key, pp; pp = batu & BATU32_601_PP; - if (msr_pr == 0) { + if (mmuidx_pr(mmu_idx) == 0) { key = !!(batu & BATU32_601_KS); } else { key = !!(batu & BATU32_601_KP); @@ -153,7 +150,8 @@ static int hash32_bat_601_prot(PowerPCCPU *cpu, } static hwaddr ppc_hash32_bat_lookup(PowerPCCPU *cpu, target_ulong ea, - MMUAccessType access_type, int *prot) + MMUAccessType access_type, int *prot, + int mmu_idx) { CPUPPCState *env = &cpu->env; target_ulong *BATlt, *BATut; @@ -177,7 +175,7 @@ static hwaddr ppc_hash32_bat_lookup(PowerPCCPU *cpu, target_ulong ea, if (unlikely(env->mmu_model == POWERPC_MMU_601)) { mask = hash32_bat_601_size(cpu, batu, batl); } else { - mask = hash32_bat_size(cpu, batu, batl); + mask = hash32_bat_size(mmu_idx, batu, batl); } LOG_BATS("%s: %cBAT%d v " TARGET_FMT_lx " BATu " TARGET_FMT_lx " BATl " TARGET_FMT_lx "\n", __func__, @@ -187,7 +185,7 @@ static hwaddr ppc_hash32_bat_lookup(PowerPCCPU *cpu, target_ulong ea, hwaddr raddr = (batl & mask) | (ea & ~mask); if (unlikely(env->mmu_model == POWERPC_MMU_601)) { - *prot = hash32_bat_601_prot(cpu, batu, batl); + *prot = hash32_bat_601_prot(mmu_idx, batu, batl); } 
else { *prot = hash32_bat_prot(cpu, batu, batl); } @@ -199,6 +197,9 @@ static hwaddr ppc_hash32_bat_lookup(PowerPCCPU *cpu, target_ulong ea, /* No hit */ #if defined(DEBUG_BATS) if (qemu_log_enabled()) { + target_ulong *BATu, *BATl; + target_ulong BEPIl, BEPIu, bl; + LOG_BATS("no BAT match for " TARGET_FMT_lx ":\n", ea); for (i = 0; i < 4; i++) { BATu = &BATut[i]; @@ -218,14 +219,15 @@ static hwaddr ppc_hash32_bat_lookup(PowerPCCPU *cpu, target_ulong ea, return -1; } -static int ppc_hash32_direct_store(PowerPCCPU *cpu, target_ulong sr, - target_ulong eaddr, - MMUAccessType access_type, - hwaddr *raddr, int *prot) +static bool ppc_hash32_direct_store(PowerPCCPU *cpu, target_ulong sr, + target_ulong eaddr, + MMUAccessType access_type, + hwaddr *raddr, int *prot, int mmu_idx, + bool guest_visible) { CPUState *cs = CPU(cpu); CPUPPCState *env = &cpu->env; - int key = !!(msr_pr ? (sr & SR32_KP) : (sr & SR32_KS)); + int key = !!(mmuidx_pr(mmu_idx) ? (sr & SR32_KP) : (sr & SR32_KS)); qemu_log_mask(CPU_LOG_MMU, "direct store...\n"); @@ -238,17 +240,23 @@ static int ppc_hash32_direct_store(PowerPCCPU *cpu, target_ulong sr, */ *raddr = ((sr & 0xF) << 28) | (eaddr & 0x0FFFFFFF); *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; - return 0; + return true; } if (access_type == MMU_INST_FETCH) { /* No code fetch is allowed in direct-store areas */ - cs->exception_index = POWERPC_EXCP_ISI; - env->error_code = 0x10000000; - return 1; + if (guest_visible) { + cs->exception_index = POWERPC_EXCP_ISI; + env->error_code = 0x10000000; + } + return false; } - switch (env->access_type) { + /* + * From ppc_cpu_get_phys_page_debug, env->access_type is not set. + * Assume ACCESS_INT for that case. + */ + switch (guest_visible ? 
env->access_type : ACCESS_INT) { case ACCESS_INT: /* Integer load/store : only access allowed */ break; @@ -257,7 +265,7 @@ static int ppc_hash32_direct_store(PowerPCCPU *cpu, target_ulong sr, cs->exception_index = POWERPC_EXCP_ALIGN; env->error_code = POWERPC_EXCP_ALIGN_FP; env->spr[SPR_DAR] = eaddr; - return 1; + return false; case ACCESS_RES: /* lwarx, ldarx or srwcx. */ env->error_code = 0; @@ -267,7 +275,7 @@ static int ppc_hash32_direct_store(PowerPCCPU *cpu, target_ulong sr, } else { env->spr[SPR_DSISR] = 0x04000000; } - return 1; + return false; case ACCESS_CACHE: /* * dcba, dcbt, dcbtst, dcbf, dcbi, dcbst, dcbz, or icbi @@ -276,7 +284,7 @@ static int ppc_hash32_direct_store(PowerPCCPU *cpu, target_ulong sr, * no-op, it's quite easy :-) */ *raddr = eaddr; - return 0; + return true; case ACCESS_EXT: /* eciwx or ecowx */ cs->exception_index = POWERPC_EXCP_DSI; @@ -287,16 +295,18 @@ static int ppc_hash32_direct_store(PowerPCCPU *cpu, target_ulong sr, } else { env->spr[SPR_DSISR] = 0x04100000; } - return 1; + return false; default: - cpu_abort(cs, "ERROR: instruction should not need " - "address translation\n"); + cpu_abort(cs, "ERROR: insn should not need address translation\n"); } - if ((access_type == MMU_DATA_STORE || key != 1) && - (access_type == MMU_DATA_LOAD || key != 0)) { + + *prot = key ? 
PAGE_READ | PAGE_WRITE : PAGE_READ; + if (*prot & prot_for_access_type(access_type)) { *raddr = eaddr; - return 0; - } else { + return true; + } + + if (guest_visible) { cs->exception_index = POWERPC_EXCP_DSI; env->error_code = 0; env->spr[SPR_DAR] = eaddr; @@ -305,8 +315,8 @@ static int ppc_hash32_direct_store(PowerPCCPU *cpu, target_ulong sr, } else { env->spr[SPR_DSISR] = 0x08000000; } - return 1; } + return false; } hwaddr get_pteg_offset32(PowerPCCPU *cpu, hwaddr hash) @@ -415,8 +425,9 @@ static hwaddr ppc_hash32_pte_raddr(target_ulong sr, ppc_hash_pte32_t pte, return (rpn & ~mask) | (eaddr & mask); } -int ppc_hash32_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx, - int mmu_idx) +bool ppc_hash32_xlate(PowerPCCPU *cpu, vaddr eaddr, MMUAccessType access_type, + hwaddr *raddrp, int *psizep, int *protp, int mmu_idx, + bool guest_visible) { CPUState *cs = CPU(cpu); CPUPPCState *env = &cpu->env; @@ -425,48 +436,45 @@ int ppc_hash32_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx, ppc_hash_pte32_t pte; int prot; int need_prot; - MMUAccessType access_type; hwaddr raddr; - assert((rwx == 0) || (rwx == 1) || (rwx == 2)); - access_type = rwx; - need_prot = prot_for_access_type(access_type); + /* There are no hash32 large pages. */ + *psizep = TARGET_PAGE_BITS; /* 1. Handle real mode accesses */ - if (access_type == MMU_INST_FETCH ? !msr_ir : !msr_dr) { + if (mmuidx_real(mmu_idx)) { /* Translation is off */ - raddr = eaddr; - tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK, - PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx, - TARGET_PAGE_SIZE); - return 0; + *raddrp = eaddr; + *protp = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + return true; } + need_prot = prot_for_access_type(access_type); + /* 2. 
Check Block Address Translation entries (BATs) */ if (env->nb_BATs != 0) { - raddr = ppc_hash32_bat_lookup(cpu, eaddr, access_type, &prot); + raddr = ppc_hash32_bat_lookup(cpu, eaddr, access_type, protp, mmu_idx); if (raddr != -1) { - if (need_prot & ~prot) { - if (access_type == MMU_INST_FETCH) { - cs->exception_index = POWERPC_EXCP_ISI; - env->error_code = 0x08000000; - } else { - cs->exception_index = POWERPC_EXCP_DSI; - env->error_code = 0; - env->spr[SPR_DAR] = eaddr; - if (access_type == MMU_DATA_STORE) { - env->spr[SPR_DSISR] = 0x0a000000; + if (need_prot & ~*protp) { + if (guest_visible) { + if (access_type == MMU_INST_FETCH) { + cs->exception_index = POWERPC_EXCP_ISI; + env->error_code = 0x08000000; } else { - env->spr[SPR_DSISR] = 0x08000000; + cs->exception_index = POWERPC_EXCP_DSI; + env->error_code = 0; + env->spr[SPR_DAR] = eaddr; + if (access_type == MMU_DATA_STORE) { + env->spr[SPR_DSISR] = 0x0a000000; + } else { + env->spr[SPR_DSISR] = 0x08000000; + } } } - return 1; + return false; } - - tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, - raddr & TARGET_PAGE_MASK, prot, mmu_idx, - TARGET_PAGE_SIZE); - return 0; + *raddrp = raddr; + return true; } } @@ -475,67 +483,65 @@ int ppc_hash32_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx, /* 4. Handle direct store segments */ if (sr & SR32_T) { - if (ppc_hash32_direct_store(cpu, sr, eaddr, access_type, - &raddr, &prot) == 0) { - tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, - raddr & TARGET_PAGE_MASK, prot, mmu_idx, - TARGET_PAGE_SIZE); - return 0; - } else { - return 1; - } + return ppc_hash32_direct_store(cpu, sr, eaddr, access_type, + raddrp, protp, mmu_idx, guest_visible); } /* 5. Check for segment level no-execute violation */ if (access_type == MMU_INST_FETCH && (sr & SR32_NX)) { - cs->exception_index = POWERPC_EXCP_ISI; - env->error_code = 0x10000000; - return 1; + if (guest_visible) { + cs->exception_index = POWERPC_EXCP_ISI; + env->error_code = 0x10000000; + } + return false; } /* 6. 
Locate the PTE in the hash table */ pte_offset = ppc_hash32_htab_lookup(cpu, sr, eaddr, &pte); if (pte_offset == -1) { - if (access_type == MMU_INST_FETCH) { - cs->exception_index = POWERPC_EXCP_ISI; - env->error_code = 0x40000000; - } else { - cs->exception_index = POWERPC_EXCP_DSI; - env->error_code = 0; - env->spr[SPR_DAR] = eaddr; - if (access_type == MMU_DATA_STORE) { - env->spr[SPR_DSISR] = 0x42000000; + if (guest_visible) { + if (access_type == MMU_INST_FETCH) { + cs->exception_index = POWERPC_EXCP_ISI; + env->error_code = 0x40000000; } else { - env->spr[SPR_DSISR] = 0x40000000; + cs->exception_index = POWERPC_EXCP_DSI; + env->error_code = 0; + env->spr[SPR_DAR] = eaddr; + if (access_type == MMU_DATA_STORE) { + env->spr[SPR_DSISR] = 0x42000000; + } else { + env->spr[SPR_DSISR] = 0x40000000; + } } } - - return 1; + return false; } qemu_log_mask(CPU_LOG_MMU, "found PTE at offset %08" HWADDR_PRIx "\n", pte_offset); /* 7. Check access permissions */ - prot = ppc_hash32_pte_prot(cpu, sr, pte); + prot = ppc_hash32_pte_prot(mmu_idx, sr, pte); if (need_prot & ~prot) { /* Access right violation */ qemu_log_mask(CPU_LOG_MMU, "PTE access rejected\n"); - if (access_type == MMU_INST_FETCH) { - cs->exception_index = POWERPC_EXCP_ISI; - env->error_code = 0x08000000; - } else { - cs->exception_index = POWERPC_EXCP_DSI; - env->error_code = 0; - env->spr[SPR_DAR] = eaddr; - if (access_type == MMU_DATA_STORE) { - env->spr[SPR_DSISR] = 0x0a000000; + if (guest_visible) { + if (access_type == MMU_INST_FETCH) { + cs->exception_index = POWERPC_EXCP_ISI; + env->error_code = 0x08000000; } else { - env->spr[SPR_DSISR] = 0x08000000; + cs->exception_index = POWERPC_EXCP_DSI; + env->error_code = 0; + env->spr[SPR_DAR] = eaddr; + if (access_type == MMU_DATA_STORE) { + env->spr[SPR_DSISR] = 0x0a000000; + } else { + env->spr[SPR_DSISR] = 0x08000000; + } } } - return 1; + return false; } qemu_log_mask(CPU_LOG_MMU, "PTE access granted !\n"); @@ -559,45 +565,7 @@ int 
ppc_hash32_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx, /* 9. Determine the real address from the PTE */ - raddr = ppc_hash32_pte_raddr(sr, pte, eaddr); - - tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK, - prot, mmu_idx, TARGET_PAGE_SIZE); - - return 0; -} - -hwaddr ppc_hash32_get_phys_page_debug(PowerPCCPU *cpu, target_ulong eaddr) -{ - CPUPPCState *env = &cpu->env; - target_ulong sr; - hwaddr pte_offset; - ppc_hash_pte32_t pte; - int prot; - - if (msr_dr == 0) { - /* Translation is off */ - return eaddr; - } - - if (env->nb_BATs != 0) { - hwaddr raddr = ppc_hash32_bat_lookup(cpu, eaddr, 0, &prot); - if (raddr != -1) { - return raddr; - } - } - - sr = env->sr[eaddr >> 28]; - - if (sr & SR32_T) { - /* FIXME: Add suitable debug support for Direct Store segments */ - return -1; - } - - pte_offset = ppc_hash32_htab_lookup(cpu, sr, eaddr, &pte); - if (pte_offset == -1) { - return -1; - } - - return ppc_hash32_pte_raddr(sr, pte, eaddr) & TARGET_PAGE_MASK; + *raddrp = ppc_hash32_pte_raddr(sr, pte, eaddr); + *protp = prot; + return true; } diff --git a/target/ppc/mmu-hash32.h b/target/ppc/mmu-hash32.h index 898021f..3892b69 100644 --- a/target/ppc/mmu-hash32.h +++ b/target/ppc/mmu-hash32.h @@ -4,9 +4,9 @@ #ifndef CONFIG_USER_ONLY hwaddr get_pteg_offset32(PowerPCCPU *cpu, hwaddr hash); -hwaddr ppc_hash32_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr); -int ppc_hash32_handle_mmu_fault(PowerPCCPU *cpu, vaddr address, int rw, - int mmu_idx); +bool ppc_hash32_xlate(PowerPCCPU *cpu, vaddr eaddr, MMUAccessType access_type, + hwaddr *raddrp, int *psizep, int *protp, int mmu_idx, + bool guest_visible); /* * Segment register definitions @@ -22,6 +22,8 @@ int ppc_hash32_handle_mmu_fault(PowerPCCPU *cpu, vaddr address, int rw, * Block Address Translation (BAT) definitions */ +#define BATU32_BEPIU 0xf0000000 +#define BATU32_BEPIL 0x0ffe0000 #define BATU32_BEPI 0xfffe0000 #define BATU32_BL 0x00001ffc #define BATU32_VS 0x00000002 diff --git 
a/target/ppc/mmu-hash64.c b/target/ppc/mmu-hash64.c index 708dffc..19832c4 100644 --- a/target/ppc/mmu-hash64.c +++ b/target/ppc/mmu-hash64.c @@ -366,10 +366,9 @@ static inline int ppc_hash64_pte_noexec_guard(PowerPCCPU *cpu, } /* Check Basic Storage Protection */ -static int ppc_hash64_pte_prot(PowerPCCPU *cpu, +static int ppc_hash64_pte_prot(int mmu_idx, ppc_slb_t *slb, ppc_hash_pte64_t pte) { - CPUPPCState *env = &cpu->env; unsigned pp, key; /* * Some pp bit combinations have undefined behaviour, so default @@ -377,7 +376,7 @@ static int ppc_hash64_pte_prot(PowerPCCPU *cpu, */ int prot = 0; - key = !!(msr_pr ? (slb->vsid & SLB_VSID_KP) + key = !!(mmuidx_pr(mmu_idx) ? (slb->vsid & SLB_VSID_KP) : (slb->vsid & SLB_VSID_KS)); pp = (pte.pte1 & HPTE64_R_PP) | ((pte.pte1 & HPTE64_R_PP0) >> 61); @@ -744,17 +743,17 @@ static bool ppc_hash64_use_vrma(CPUPPCState *env) } } -static void ppc_hash64_set_isi(CPUState *cs, uint64_t error_code) +static void ppc_hash64_set_isi(CPUState *cs, int mmu_idx, uint64_t error_code) { CPUPPCState *env = &POWERPC_CPU(cs)->env; bool vpm; - if (msr_ir) { + if (!mmuidx_real(mmu_idx)) { vpm = !!(env->spr[SPR_LPCR] & LPCR_VPM1); } else { vpm = ppc_hash64_use_vrma(env); } - if (vpm && !msr_hv) { + if (vpm && !mmuidx_hv(mmu_idx)) { cs->exception_index = POWERPC_EXCP_HISI; } else { cs->exception_index = POWERPC_EXCP_ISI; @@ -762,17 +761,17 @@ static void ppc_hash64_set_isi(CPUState *cs, uint64_t error_code) env->error_code = error_code; } -static void ppc_hash64_set_dsi(CPUState *cs, uint64_t dar, uint64_t dsisr) +static void ppc_hash64_set_dsi(CPUState *cs, int mmu_idx, uint64_t dar, uint64_t dsisr) { CPUPPCState *env = &POWERPC_CPU(cs)->env; bool vpm; - if (msr_dr) { + if (!mmuidx_real(mmu_idx)) { vpm = !!(env->spr[SPR_LPCR] & LPCR_VPM1); } else { vpm = ppc_hash64_use_vrma(env); } - if (vpm && !msr_hv) { + if (vpm && !mmuidx_hv(mmu_idx)) { cs->exception_index = POWERPC_EXCP_HDSI; env->spr[SPR_HDAR] = dar; env->spr[SPR_HDSISR] = dsisr; @@ -873,8 
+872,9 @@ static int build_vrma_slbe(PowerPCCPU *cpu, ppc_slb_t *slb) return -1; } -int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, - int rwx, int mmu_idx) +bool ppc_hash64_xlate(PowerPCCPU *cpu, vaddr eaddr, MMUAccessType access_type, + hwaddr *raddrp, int *psizep, int *protp, int mmu_idx, + bool guest_visible) { CPUState *cs = CPU(cpu); CPUPPCState *env = &cpu->env; @@ -884,13 +884,9 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, hwaddr ptex; ppc_hash_pte64_t pte; int exec_prot, pp_prot, amr_prot, prot; - MMUAccessType access_type; int need_prot; hwaddr raddr; - assert((rwx == 0) || (rwx == 1) || (rwx == 2)); - access_type = rwx; - /* * Note on LPCR usage: 970 uses HID4, but our special variant of * store_spr copies relevant fields into env->spr[SPR_LPCR]. @@ -900,7 +896,7 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, */ /* 1. Handle real mode accesses */ - if (access_type == MMU_INST_FETCH ? !msr_ir : !msr_dr) { + if (mmuidx_real(mmu_idx)) { /* * Translation is supposedly "off", but in real mode the top 4 * effective address bits are (mostly) ignored @@ -912,7 +908,7 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, * In virtual hypervisor mode, there's nothing to do: * EA == GPA == qemu guest address */ - } else if (msr_hv || !env->has_hv_mode) { + } else if (mmuidx_hv(mmu_idx) || !env->has_hv_mode) { /* In HV mode, add HRMOR if top EA bit is clear */ if (!(eaddr >> 63)) { raddr |= env->spr[SPR_HRMOR]; @@ -922,9 +918,11 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, slb = &vrma_slbe; if (build_vrma_slbe(cpu, slb) != 0) { /* Invalid VRMA setup, machine check */ - cs->exception_index = POWERPC_EXCP_MCHECK; - env->error_code = 0; - return 1; + if (guest_visible) { + cs->exception_index = POWERPC_EXCP_MCHECK; + env->error_code = 0; + } + return false; } goto skip_slb_search; @@ -933,29 +931,33 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, /* Emulated old-style RMO 
mode, bounds check against RMLS */ if (raddr >= limit) { + if (!guest_visible) { + return false; + } switch (access_type) { case MMU_INST_FETCH: - ppc_hash64_set_isi(cs, SRR1_PROTFAULT); + ppc_hash64_set_isi(cs, mmu_idx, SRR1_PROTFAULT); break; case MMU_DATA_LOAD: - ppc_hash64_set_dsi(cs, eaddr, DSISR_PROTFAULT); + ppc_hash64_set_dsi(cs, mmu_idx, eaddr, DSISR_PROTFAULT); break; case MMU_DATA_STORE: - ppc_hash64_set_dsi(cs, eaddr, + ppc_hash64_set_dsi(cs, mmu_idx, eaddr, DSISR_PROTFAULT | DSISR_ISSTORE); break; default: g_assert_not_reached(); } - return 1; + return false; } raddr |= env->spr[SPR_RMOR]; } - tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK, - PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx, - TARGET_PAGE_SIZE); - return 0; + + *raddrp = raddr; + *protp = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + *psizep = TARGET_PAGE_BITS; + return true; } /* 2. Translation is on, so look up the SLB */ @@ -968,6 +970,9 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, exit(1); } /* Segment still not found, generate the appropriate interrupt */ + if (!guest_visible) { + return false; + } switch (access_type) { case MMU_INST_FETCH: cs->exception_index = POWERPC_EXCP_ISEG; @@ -982,34 +987,39 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, default: g_assert_not_reached(); } - return 1; + return false; } -skip_slb_search: + skip_slb_search: /* 3. Check for segment level no-execute violation */ if (access_type == MMU_INST_FETCH && (slb->vsid & SLB_VSID_N)) { - ppc_hash64_set_isi(cs, SRR1_NOEXEC_GUARD); - return 1; + if (guest_visible) { + ppc_hash64_set_isi(cs, mmu_idx, SRR1_NOEXEC_GUARD); + } + return false; } /* 4. 
Locate the PTE in the hash table */ ptex = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte, &apshift); if (ptex == -1) { + if (!guest_visible) { + return false; + } switch (access_type) { case MMU_INST_FETCH: - ppc_hash64_set_isi(cs, SRR1_NOPTE); + ppc_hash64_set_isi(cs, mmu_idx, SRR1_NOPTE); break; case MMU_DATA_LOAD: - ppc_hash64_set_dsi(cs, eaddr, DSISR_NOPTE); + ppc_hash64_set_dsi(cs, mmu_idx, eaddr, DSISR_NOPTE); break; case MMU_DATA_STORE: - ppc_hash64_set_dsi(cs, eaddr, DSISR_NOPTE | DSISR_ISSTORE); + ppc_hash64_set_dsi(cs, mmu_idx, eaddr, DSISR_NOPTE | DSISR_ISSTORE); break; default: g_assert_not_reached(); } - return 1; + return false; } qemu_log_mask(CPU_LOG_MMU, "found PTE at index %08" HWADDR_PRIx "\n", ptex); @@ -1017,7 +1027,7 @@ skip_slb_search: /* 5. Check access permissions */ exec_prot = ppc_hash64_pte_noexec_guard(cpu, pte); - pp_prot = ppc_hash64_pte_prot(cpu, slb, pte); + pp_prot = ppc_hash64_pte_prot(mmu_idx, slb, pte); amr_prot = ppc_hash64_amr_prot(cpu, pte); prot = exec_prot & pp_prot & amr_prot; @@ -1025,6 +1035,9 @@ skip_slb_search: if (need_prot & ~prot) { /* Access right violation */ qemu_log_mask(CPU_LOG_MMU, "PTE access rejected\n"); + if (!guest_visible) { + return false; + } if (access_type == MMU_INST_FETCH) { int srr1 = 0; if (PAGE_EXEC & ~exec_prot) { @@ -1035,7 +1048,7 @@ skip_slb_search: if (PAGE_EXEC & ~amr_prot) { srr1 |= SRR1_IAMR; /* Access violates virt pg class key prot */ } - ppc_hash64_set_isi(cs, srr1); + ppc_hash64_set_isi(cs, mmu_idx, srr1); } else { int dsisr = 0; if (need_prot & ~pp_prot) { @@ -1047,9 +1060,9 @@ skip_slb_search: if (need_prot & ~amr_prot) { dsisr |= DSISR_AMR; } - ppc_hash64_set_dsi(cs, eaddr, dsisr); + ppc_hash64_set_dsi(cs, mmu_idx, eaddr, dsisr); } - return 1; + return false; } qemu_log_mask(CPU_LOG_MMU, "PTE access granted !\n"); @@ -1073,66 +1086,10 @@ skip_slb_search: /* 7. 
Determine the real address from the PTE */ - raddr = deposit64(pte.pte1 & HPTE64_R_RPN, 0, apshift, eaddr); - - tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK, - prot, mmu_idx, 1ULL << apshift); - - return 0; -} - -hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr) -{ - CPUPPCState *env = &cpu->env; - ppc_slb_t vrma_slbe; - ppc_slb_t *slb; - hwaddr ptex, raddr; - ppc_hash_pte64_t pte; - unsigned apshift; - - /* Handle real mode */ - if (msr_dr == 0) { - /* In real mode the top 4 effective address bits are ignored */ - raddr = addr & 0x0FFFFFFFFFFFFFFFULL; - - if (cpu->vhyp) { - /* - * In virtual hypervisor mode, there's nothing to do: - * EA == GPA == qemu guest address - */ - return raddr; - } else if ((msr_hv || !env->has_hv_mode) && !(addr >> 63)) { - /* In HV mode, add HRMOR if top EA bit is clear */ - return raddr | env->spr[SPR_HRMOR]; - } else if (ppc_hash64_use_vrma(env)) { - /* Emulated VRMA mode */ - slb = &vrma_slbe; - if (build_vrma_slbe(cpu, slb) != 0) { - return -1; - } - } else { - target_ulong limit = rmls_limit(cpu); - - /* Emulated old-style RMO mode, bounds check against RMLS */ - if (raddr >= limit) { - return -1; - } - return raddr | env->spr[SPR_RMOR]; - } - } else { - slb = slb_lookup(cpu, addr); - if (!slb) { - return -1; - } - } - - ptex = ppc_hash64_htab_lookup(cpu, slb, addr, &pte, &apshift); - if (ptex == -1) { - return -1; - } - - return deposit64(pte.pte1 & HPTE64_R_RPN, 0, apshift, addr) - & TARGET_PAGE_MASK; + *raddrp = deposit64(pte.pte1 & HPTE64_R_RPN, 0, apshift, eaddr); + *protp = prot; + *psizep = apshift; + return true; } void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu, target_ulong ptex, diff --git a/target/ppc/mmu-hash64.h b/target/ppc/mmu-hash64.h index 4b8b8e7..c5b2f97 100644 --- a/target/ppc/mmu-hash64.h +++ b/target/ppc/mmu-hash64.h @@ -7,9 +7,9 @@ void dump_slb(PowerPCCPU *cpu); int ppc_store_slb(PowerPCCPU *cpu, target_ulong slot, target_ulong esid, target_ulong vsid); -hwaddr 
ppc_hash64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr); -int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr address, int rw, - int mmu_idx); +bool ppc_hash64_xlate(PowerPCCPU *cpu, vaddr eaddr, MMUAccessType access_type, + hwaddr *raddrp, int *psizep, int *protp, int mmu_idx, + bool guest_visible); void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu, target_ulong pte_index, target_ulong pte0, target_ulong pte1); diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c index b6d191c..5b0e62e 100644 --- a/target/ppc/mmu-radix64.c +++ b/target/ppc/mmu-radix64.c @@ -155,7 +155,7 @@ static void ppc_radix64_raise_hsi(PowerPCCPU *cpu, MMUAccessType access_type, static bool ppc_radix64_check_prot(PowerPCCPU *cpu, MMUAccessType access_type, uint64_t pte, int *fault_cause, int *prot, - bool partition_scoped) + int mmu_idx, bool partition_scoped) { CPUPPCState *env = &cpu->env; int need_prot; @@ -173,7 +173,8 @@ static bool ppc_radix64_check_prot(PowerPCCPU *cpu, MMUAccessType access_type, /* Determine permissions allowed by Encoded Access Authority */ if (!partition_scoped && (pte & R_PTE_EAA_PRIV) && msr_pr) { *prot = 0; - } else if (msr_pr || (pte & R_PTE_EAA_PRIV) || partition_scoped) { + } else if (mmuidx_pr(mmu_idx) || (pte & R_PTE_EAA_PRIV) || + partition_scoped) { *prot = ppc_radix64_get_prot_eaa(pte); } else { /* !msr_pr && !(pte & R_PTE_EAA_PRIV) && !partition_scoped */ *prot = ppc_radix64_get_prot_eaa(pte); @@ -299,7 +300,7 @@ static int ppc_radix64_partition_scoped_xlate(PowerPCCPU *cpu, ppc_v3_pate_t pate, hwaddr *h_raddr, int *h_prot, int *h_page_size, bool pde_addr, - bool guest_visible) + int mmu_idx, bool guest_visible) { int fault_cause = 0; hwaddr pte_addr; @@ -310,7 +311,8 @@ static int ppc_radix64_partition_scoped_xlate(PowerPCCPU *cpu, if (ppc_radix64_walk_tree(CPU(cpu)->as, g_raddr, pate.dw0 & PRTBE_R_RPDB, pate.dw0 & PRTBE_R_RPDS, h_raddr, h_page_size, &pte, &fault_cause, &pte_addr) || - ppc_radix64_check_prot(cpu, access_type, pte, 
&fault_cause, h_prot, true)) { + ppc_radix64_check_prot(cpu, access_type, pte, + &fault_cause, h_prot, mmu_idx, true)) { if (pde_addr) { /* address being translated was that of a guest pde */ fault_cause |= DSISR_PRTABLE_FAULT; } @@ -332,7 +334,7 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu, vaddr eaddr, uint64_t pid, ppc_v3_pate_t pate, hwaddr *g_raddr, int *g_prot, int *g_page_size, - bool guest_visible) + int mmu_idx, bool guest_visible) { CPUState *cs = CPU(cpu); CPUPPCState *env = &cpu->env; @@ -367,7 +369,8 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu, ret = ppc_radix64_partition_scoped_xlate(cpu, 0, eaddr, prtbe_addr, pate, &h_raddr, &h_prot, &h_page_size, true, - guest_visible); + /* mmu_idx is 5 because we're translating from hypervisor scope */ + 5, guest_visible); if (ret) { return ret; } @@ -407,7 +410,8 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu, ret = ppc_radix64_partition_scoped_xlate(cpu, 0, eaddr, pte_addr, pate, &h_raddr, &h_prot, &h_page_size, true, - guest_visible); + /* mmu_idx is 5 because we're translating from hypervisor scope */ + 5, guest_visible); if (ret) { return ret; } @@ -431,7 +435,8 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu, *g_raddr = (rpn & ~mask) | (eaddr & mask); } - if (ppc_radix64_check_prot(cpu, access_type, pte, &fault_cause, g_prot, false)) { + if (ppc_radix64_check_prot(cpu, access_type, pte, &fault_cause, + g_prot, mmu_idx, false)) { /* Access denied due to protection */ if (guest_visible) { ppc_radix64_raise_si(cpu, access_type, eaddr, fault_cause); @@ -463,24 +468,53 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu, * | = On | Process Scoped | Scoped | * +-------------+----------------+---------------+ */ -static int ppc_radix64_xlate(PowerPCCPU *cpu, vaddr eaddr, - MMUAccessType access_type, - bool relocation, - hwaddr *raddr, int *psizep, int *protp, - bool guest_visible) +bool ppc_radix64_xlate(PowerPCCPU *cpu, vaddr eaddr, 
MMUAccessType access_type, + hwaddr *raddr, int *psizep, int *protp, int mmu_idx, + bool guest_visible) { CPUPPCState *env = &cpu->env; uint64_t lpid, pid; ppc_v3_pate_t pate; int psize, prot; hwaddr g_raddr; + bool relocation; + + assert(!(mmuidx_hv(mmu_idx) && cpu->vhyp)); + + relocation = !mmuidx_real(mmu_idx); + + /* HV or virtual hypervisor Real Mode Access */ + if (!relocation && (mmuidx_hv(mmu_idx) || cpu->vhyp)) { + /* In real mode top 4 effective addr bits (mostly) ignored */ + *raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL; + + /* In HV mode, add HRMOR if top EA bit is clear */ + if (mmuidx_hv(mmu_idx) || !env->has_hv_mode) { + if (!(eaddr >> 63)) { + *raddr |= env->spr[SPR_HRMOR]; + } + } + *protp = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + *psizep = TARGET_PAGE_BITS; + return true; + } + + /* + * Check UPRT (we avoid the check in real mode to deal with + * transitional states during kexec. + */ + if (guest_visible && !ppc64_use_proc_tbl(cpu)) { + qemu_log_mask(LOG_GUEST_ERROR, + "LPCR:UPRT not set in radix mode ! 
LPCR=" + TARGET_FMT_lx "\n", env->spr[SPR_LPCR]); + } /* Virtual Mode Access - get the fully qualified address */ if (!ppc_radix64_get_fully_qualified_addr(&cpu->env, eaddr, &lpid, &pid)) { if (guest_visible) { ppc_radix64_raise_segi(cpu, access_type, eaddr); } - return 1; + return false; } /* Get Process Table */ @@ -493,13 +527,13 @@ static int ppc_radix64_xlate(PowerPCCPU *cpu, vaddr eaddr, if (guest_visible) { ppc_radix64_raise_si(cpu, access_type, eaddr, DSISR_NOPTE); } - return 1; + return false; } if (!validate_pate(cpu, lpid, &pate)) { if (guest_visible) { ppc_radix64_raise_si(cpu, access_type, eaddr, DSISR_R_BADCONFIG); } - return 1; + return false; } } @@ -517,9 +551,9 @@ static int ppc_radix64_xlate(PowerPCCPU *cpu, vaddr eaddr, if (relocation) { int ret = ppc_radix64_process_scoped_xlate(cpu, access_type, eaddr, pid, pate, &g_raddr, &prot, - &psize, guest_visible); + &psize, mmu_idx, guest_visible); if (ret) { - return ret; + return false; } *psizep = MIN(*psizep, psize); *protp &= prot; @@ -535,15 +569,15 @@ static int ppc_radix64_xlate(PowerPCCPU *cpu, vaddr eaddr, * quadrants 1 or 2. Translates a guest real address to a host * real address. */ - if (lpid || !msr_hv) { + if (lpid || !mmuidx_hv(mmu_idx)) { int ret; ret = ppc_radix64_partition_scoped_xlate(cpu, access_type, eaddr, g_raddr, pate, raddr, &prot, &psize, false, - guest_visible); + mmu_idx, guest_visible); if (ret) { - return ret; + return false; } *psizep = MIN(*psizep, psize); *protp &= prot; @@ -552,78 +586,5 @@ static int ppc_radix64_xlate(PowerPCCPU *cpu, vaddr eaddr, } } - return 0; -} - -int ppc_radix64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx, - int mmu_idx) -{ - CPUState *cs = CPU(cpu); - CPUPPCState *env = &cpu->env; - int page_size, prot; - bool relocation; - MMUAccessType access_type; - hwaddr raddr; - - assert(!(msr_hv && cpu->vhyp)); - assert((rwx == 0) || (rwx == 1) || (rwx == 2)); - access_type = rwx; - - relocation = (access_type == MMU_INST_FETCH ? 
msr_ir : msr_dr); - /* HV or virtual hypervisor Real Mode Access */ - if (!relocation && (msr_hv || cpu->vhyp)) { - /* In real mode top 4 effective addr bits (mostly) ignored */ - raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL; - - /* In HV mode, add HRMOR if top EA bit is clear */ - if (msr_hv || !env->has_hv_mode) { - if (!(eaddr >> 63)) { - raddr |= env->spr[SPR_HRMOR]; - } - } - tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK, - PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx, - TARGET_PAGE_SIZE); - return 0; - } - - /* - * Check UPRT (we avoid the check in real mode to deal with - * transitional states during kexec. - */ - if (!ppc64_use_proc_tbl(cpu)) { - qemu_log_mask(LOG_GUEST_ERROR, - "LPCR:UPRT not set in radix mode ! LPCR=" - TARGET_FMT_lx "\n", env->spr[SPR_LPCR]); - } - - /* Translate eaddr to raddr (where raddr is addr qemu needs for access) */ - if (ppc_radix64_xlate(cpu, eaddr, access_type, relocation, &raddr, - &page_size, &prot, true)) { - return 1; - } - - tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK, - prot, mmu_idx, 1UL << page_size); - return 0; -} - -hwaddr ppc_radix64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong eaddr) -{ - CPUPPCState *env = &cpu->env; - int psize, prot; - hwaddr raddr; - - /* Handle Real Mode */ - if ((msr_dr == 0) && (msr_hv || cpu->vhyp)) { - /* In real mode top 4 effective addr bits (mostly) ignored */ - return eaddr & 0x0FFFFFFFFFFFFFFFULL; - } - - if (ppc_radix64_xlate(cpu, eaddr, 0, msr_dr, &raddr, &psize, - &prot, false)) { - return -1; - } - - return raddr & TARGET_PAGE_MASK; + return true; } diff --git a/target/ppc/mmu-radix64.h b/target/ppc/mmu-radix64.h index f28c579..b70357c 100644 --- a/target/ppc/mmu-radix64.h +++ b/target/ppc/mmu-radix64.h @@ -44,9 +44,9 @@ #ifdef TARGET_PPC64 -int ppc_radix64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx, - int mmu_idx); -hwaddr ppc_radix64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr); +bool ppc_radix64_xlate(PowerPCCPU 
*cpu, vaddr eaddr, MMUAccessType access_type, + hwaddr *raddr, int *psizep, int *protp, int mmu_idx, + bool guest_visible); static inline int ppc_radix64_get_prot_eaa(uint64_t pte) { diff --git a/target/ppc/mmu_helper.c b/target/ppc/mmu_helper.c index 1ecb36e..869d24d 100644 --- a/target/ppc/mmu_helper.c +++ b/target/ppc/mmu_helper.c @@ -511,7 +511,7 @@ static int get_segment_6xx_tlb(CPUPPCState *env, mmu_ctx_t *ctx, qemu_log("Page table: " TARGET_FMT_plx " len " TARGET_FMT_plx "\n", ppc_hash32_hpt_base(cpu), - ppc_hash32_hpt_mask(env) + 0x80); + ppc_hash32_hpt_mask(cpu) + 0x80); for (curaddr = ppc_hash32_hpt_base(cpu); curaddr < (ppc_hash32_hpt_base(cpu) + ppc_hash32_hpt_mask(cpu) + 0x80); @@ -825,6 +825,7 @@ static int mmubooke_get_physical_address(CPUPPCState *env, mmu_ctx_t *ctx, return ret; } +#ifdef CONFIG_TCG static void booke206_flush_tlb(CPUPPCState *env, int flags, const int check_iprot) { @@ -846,6 +847,7 @@ static void booke206_flush_tlb(CPUPPCState *env, int flags, tlb_flush(env_cpu(env)); } +#endif static hwaddr booke206_tlb_to_page_size(CPUPPCState *env, ppcmas_tlb_t *tlb) @@ -1435,48 +1437,6 @@ static int get_physical_address(CPUPPCState *env, mmu_ctx_t *ctx, } #endif -hwaddr ppc_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) -{ - PowerPCCPU *cpu = POWERPC_CPU(cs); - CPUPPCState *env = &cpu->env; - mmu_ctx_t ctx; - - switch (env->mmu_model) { -#if defined(TARGET_PPC64) - case POWERPC_MMU_64B: - case POWERPC_MMU_2_03: - case POWERPC_MMU_2_06: - case POWERPC_MMU_2_07: - return ppc_hash64_get_phys_page_debug(cpu, addr); - case POWERPC_MMU_3_00: - return ppc64_v3_get_phys_page_debug(cpu, addr); -#endif - - case POWERPC_MMU_32B: - case POWERPC_MMU_601: - return ppc_hash32_get_phys_page_debug(cpu, addr); - - default: - ; - } - - if (unlikely(get_physical_address(env, &ctx, addr, MMU_DATA_LOAD, - ACCESS_INT) != 0)) { - - /* - * Some MMUs have separate TLBs for code and data. 
If we only - * try an ACCESS_INT, we may not be able to read instructions - * mapped by code TLBs, so we also try a ACCESS_CODE. - */ - if (unlikely(get_physical_address(env, &ctx, addr, MMU_INST_FETCH, - ACCESS_CODE) != 0)) { - return -1; - } - } - - return ctx.raddr & TARGET_PAGE_MASK; -} - static void booke206_update_mas_tlb_miss(CPUPPCState *env, target_ulong address, MMUAccessType access_type, int mmu_idx) { @@ -1532,30 +1492,38 @@ static void booke206_update_mas_tlb_miss(CPUPPCState *env, target_ulong address, } /* Perform address translation */ -static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, - MMUAccessType access_type, int mmu_idx) +/* TODO: Split this by mmu_model. */ +static bool ppc_jumbo_xlate(PowerPCCPU *cpu, vaddr eaddr, + MMUAccessType access_type, + hwaddr *raddrp, int *psizep, int *protp, + int mmu_idx, bool guest_visible) { - CPUState *cs = env_cpu(env); - PowerPCCPU *cpu = POWERPC_CPU(cs); + CPUState *cs = CPU(cpu); + CPUPPCState *env = &cpu->env; mmu_ctx_t ctx; int type; - int ret = 0; + int ret; if (access_type == MMU_INST_FETCH) { /* code access */ type = ACCESS_CODE; - } else { + } else if (guest_visible) { /* data access */ type = env->access_type; + } else { + type = ACCESS_INT; } - ret = get_physical_address_wtlb(env, &ctx, address, access_type, + + ret = get_physical_address_wtlb(env, &ctx, eaddr, access_type, type, mmu_idx); if (ret == 0) { - tlb_set_page(cs, address & TARGET_PAGE_MASK, - ctx.raddr & TARGET_PAGE_MASK, ctx.prot, - mmu_idx, TARGET_PAGE_SIZE); - ret = 0; - } else if (ret < 0) { + *raddrp = ctx.raddr; + *protp = ctx.prot; + *psizep = TARGET_PAGE_BITS; + return true; + } + + if (guest_visible) { LOG_MMU_STATE(cs); if (type == ACCESS_CODE) { switch (ret) { @@ -1565,7 +1533,7 @@ static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, case POWERPC_MMU_SOFT_6xx: cs->exception_index = POWERPC_EXCP_IFTLB; env->error_code = 1 << 18; - env->spr[SPR_IMISS] = address; + 
env->spr[SPR_IMISS] = eaddr; env->spr[SPR_ICMP] = 0x80000000 | ctx.ptem; goto tlb_miss; case POWERPC_MMU_SOFT_74xx: @@ -1575,29 +1543,25 @@ static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, case POWERPC_MMU_SOFT_4xx_Z: cs->exception_index = POWERPC_EXCP_ITLB; env->error_code = 0; - env->spr[SPR_40x_DEAR] = address; + env->spr[SPR_40x_DEAR] = eaddr; env->spr[SPR_40x_ESR] = 0x00000000; break; case POWERPC_MMU_BOOKE206: - booke206_update_mas_tlb_miss(env, address, 2, mmu_idx); + booke206_update_mas_tlb_miss(env, eaddr, 2, mmu_idx); /* fall through */ case POWERPC_MMU_BOOKE: cs->exception_index = POWERPC_EXCP_ITLB; env->error_code = 0; - env->spr[SPR_BOOKE_DEAR] = address; + env->spr[SPR_BOOKE_DEAR] = eaddr; env->spr[SPR_BOOKE_ESR] = mmubooke206_esr(mmu_idx, MMU_DATA_LOAD); - return -1; + break; case POWERPC_MMU_MPC8xx: - /* XXX: TODO */ cpu_abort(cs, "MPC8xx MMU model is not implemented\n"); - break; case POWERPC_MMU_REAL: cpu_abort(cs, "PowerPC in real mode should never raise " "any MMU exceptions\n"); - return -1; default: cpu_abort(cs, "Unknown or invalid MMU model\n"); - return -1; } break; case -2: @@ -1634,7 +1598,7 @@ static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, cs->exception_index = POWERPC_EXCP_DLTLB; env->error_code = 0; } - env->spr[SPR_DMISS] = address; + env->spr[SPR_DMISS] = eaddr; env->spr[SPR_DCMP] = 0x80000000 | ctx.ptem; tlb_miss: env->error_code |= ctx.key << 19; @@ -1652,7 +1616,7 @@ static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, tlb_miss_74xx: /* Implement LRU algorithm */ env->error_code = ctx.key << 19; - env->spr[SPR_TLBMISS] = (address & ~((target_ulong)0x3)) | + env->spr[SPR_TLBMISS] = (eaddr & ~((target_ulong)0x3)) | ((env->last_way + 1) & (env->nb_ways - 1)); env->spr[SPR_PTEHI] = 0x80000000 | ctx.ptem; break; @@ -1660,7 +1624,7 @@ static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, case POWERPC_MMU_SOFT_4xx_Z: cs->exception_index = 
POWERPC_EXCP_DTLB; env->error_code = 0; - env->spr[SPR_40x_DEAR] = address; + env->spr[SPR_40x_DEAR] = eaddr; if (access_type == MMU_DATA_STORE) { env->spr[SPR_40x_ESR] = 0x00800000; } else { @@ -1670,23 +1634,20 @@ static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, case POWERPC_MMU_MPC8xx: /* XXX: TODO */ cpu_abort(cs, "MPC8xx MMU model is not implemented\n"); - break; case POWERPC_MMU_BOOKE206: - booke206_update_mas_tlb_miss(env, address, access_type, mmu_idx); + booke206_update_mas_tlb_miss(env, eaddr, access_type, mmu_idx); /* fall through */ case POWERPC_MMU_BOOKE: cs->exception_index = POWERPC_EXCP_DTLB; env->error_code = 0; - env->spr[SPR_BOOKE_DEAR] = address; + env->spr[SPR_BOOKE_DEAR] = eaddr; env->spr[SPR_BOOKE_ESR] = mmubooke206_esr(mmu_idx, access_type); - return -1; + break; case POWERPC_MMU_REAL: cpu_abort(cs, "PowerPC in real mode should never raise " "any MMU exceptions\n"); - return -1; default: cpu_abort(cs, "Unknown or invalid MMU model\n"); - return -1; } break; case -2: @@ -1695,16 +1656,16 @@ static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, env->error_code = 0; if (env->mmu_model == POWERPC_MMU_SOFT_4xx || env->mmu_model == POWERPC_MMU_SOFT_4xx_Z) { - env->spr[SPR_40x_DEAR] = address; + env->spr[SPR_40x_DEAR] = eaddr; if (access_type == MMU_DATA_STORE) { env->spr[SPR_40x_ESR] |= 0x00800000; } } else if ((env->mmu_model == POWERPC_MMU_BOOKE) || (env->mmu_model == POWERPC_MMU_BOOKE206)) { - env->spr[SPR_BOOKE_DEAR] = address; + env->spr[SPR_BOOKE_DEAR] = eaddr; env->spr[SPR_BOOKE_ESR] = mmubooke206_esr(mmu_idx, access_type); } else { - env->spr[SPR_DAR] = address; + env->spr[SPR_DAR] = eaddr; if (access_type == MMU_DATA_STORE) { env->spr[SPR_DSISR] = 0x0A000000; } else { @@ -1719,13 +1680,13 @@ static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, /* Floating point load/store */ cs->exception_index = POWERPC_EXCP_ALIGN; env->error_code = POWERPC_EXCP_ALIGN_FP; - 
env->spr[SPR_DAR] = address; + env->spr[SPR_DAR] = eaddr; break; case ACCESS_RES: /* lwarx, ldarx or stwcx. */ cs->exception_index = POWERPC_EXCP_DSI; env->error_code = 0; - env->spr[SPR_DAR] = address; + env->spr[SPR_DAR] = eaddr; if (access_type == MMU_DATA_STORE) { env->spr[SPR_DSISR] = 0x06000000; } else { @@ -1736,7 +1697,7 @@ static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, /* eciwx or ecowx */ cs->exception_index = POWERPC_EXCP_DSI; env->error_code = 0; - env->spr[SPR_DAR] = address; + env->spr[SPR_DAR] = eaddr; if (access_type == MMU_DATA_STORE) { env->spr[SPR_DSISR] = 0x06100000; } else { @@ -1748,16 +1709,14 @@ static int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, cs->exception_index = POWERPC_EXCP_PROGRAM; env->error_code = POWERPC_EXCP_INVAL | POWERPC_EXCP_INVAL_INVAL; - env->spr[SPR_DAR] = address; + env->spr[SPR_DAR] = eaddr; break; } break; } } - ret = 1; } - - return ret; + return false; } #ifdef CONFIG_TCG @@ -1798,9 +1757,6 @@ static inline void dump_store_bat(CPUPPCState *env, char ID, int ul, int nr, void helper_store_ibatu(CPUPPCState *env, uint32_t nr, target_ulong value) { target_ulong mask; -#if defined(FLUSH_ALL_TLBS) - PowerPCCPU *cpu = env_archcpu(env); -#endif dump_store_bat(env, 'I', 0, nr, value); if (env->IBAT[0][nr] != value) { @@ -1834,9 +1790,6 @@ void helper_store_ibatl(CPUPPCState *env, uint32_t nr, target_ulong value) void helper_store_dbatu(CPUPPCState *env, uint32_t nr, target_ulong value) { target_ulong mask; -#if defined(FLUSH_ALL_TLBS) - PowerPCCPU *cpu = env_archcpu(env); -#endif dump_store_bat(env, 'D', 0, nr, value); if (env->DBAT[0][nr] != value) { @@ -1871,7 +1824,6 @@ void helper_store_601_batu(CPUPPCState *env, uint32_t nr, target_ulong value) { target_ulong mask; #if defined(FLUSH_ALL_TLBS) - PowerPCCPU *cpu = env_archcpu(env); int do_inval; #endif @@ -1916,7 +1868,6 @@ void helper_store_601_batl(CPUPPCState *env, uint32_t nr, target_ulong value) #if 
!defined(FLUSH_ALL_TLBS) target_ulong mask; #else - PowerPCCPU *cpu = env_archcpu(env); int do_inval; #endif @@ -1952,6 +1903,7 @@ void helper_store_601_batl(CPUPPCState *env, uint32_t nr, target_ulong value) } #endif +#ifdef CONFIG_TCG /*****************************************************************************/ /* TLB management */ void ppc_tlb_invalidate_all(CPUPPCState *env) @@ -1995,6 +1947,7 @@ void ppc_tlb_invalidate_all(CPUPPCState *env) break; } } +#endif #ifdef CONFIG_TCG void ppc_tlb_invalidate_one(CPUPPCState *env, target_ulong addr) @@ -2942,26 +2895,76 @@ void helper_check_tlb_flush_global(CPUPPCState *env) /*****************************************************************************/ -bool ppc_cpu_tlb_fill(CPUState *cs, vaddr addr, int size, +static bool ppc_xlate(PowerPCCPU *cpu, vaddr eaddr, MMUAccessType access_type, + hwaddr *raddrp, int *psizep, int *protp, + int mmu_idx, bool guest_visible) +{ + switch (cpu->env.mmu_model) { +#if defined(TARGET_PPC64) + case POWERPC_MMU_3_00: + if (ppc64_v3_radix(cpu)) { + return ppc_radix64_xlate(cpu, eaddr, access_type, + raddrp, psizep, protp, mmu_idx, guest_visible); + } + /* fall through */ + case POWERPC_MMU_64B: + case POWERPC_MMU_2_03: + case POWERPC_MMU_2_06: + case POWERPC_MMU_2_07: + return ppc_hash64_xlate(cpu, eaddr, access_type, + raddrp, psizep, protp, mmu_idx, guest_visible); +#endif + + case POWERPC_MMU_32B: + case POWERPC_MMU_601: + return ppc_hash32_xlate(cpu, eaddr, access_type, + raddrp, psizep, protp, mmu_idx, guest_visible); + + default: + return ppc_jumbo_xlate(cpu, eaddr, access_type, raddrp, + psizep, protp, mmu_idx, guest_visible); + } +} + +hwaddr ppc_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) +{ + PowerPCCPU *cpu = POWERPC_CPU(cs); + hwaddr raddr; + int s, p; + + /* + * Some MMUs have separate TLBs for code and data. If we only + * try an MMU_DATA_LOAD, we may not be able to read instructions + * mapped by code TLBs, so we also try a MMU_INST_FETCH. 
+ */ + if (ppc_xlate(cpu, addr, MMU_DATA_LOAD, &raddr, &s, &p, + cpu_mmu_index(&cpu->env, false), false) || + ppc_xlate(cpu, addr, MMU_INST_FETCH, &raddr, &s, &p, + cpu_mmu_index(&cpu->env, true), false)) { + return raddr & TARGET_PAGE_MASK; + } + return -1; +} + +#ifdef CONFIG_TCG +bool ppc_cpu_tlb_fill(CPUState *cs, vaddr eaddr, int size, MMUAccessType access_type, int mmu_idx, bool probe, uintptr_t retaddr) { PowerPCCPU *cpu = POWERPC_CPU(cs); - PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs); - CPUPPCState *env = &cpu->env; - int ret; + hwaddr raddr; + int page_size, prot; - if (pcc->handle_mmu_fault) { - ret = pcc->handle_mmu_fault(cpu, addr, access_type, mmu_idx); - } else { - ret = cpu_ppc_handle_mmu_fault(env, addr, access_type, mmu_idx); + if (ppc_xlate(cpu, eaddr, access_type, &raddr, + &page_size, &prot, mmu_idx, !probe)) { + tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK, + prot, mmu_idx, 1UL << page_size); + return true; } - if (unlikely(ret != 0)) { - if (probe) { - return false; - } - raise_exception_err_ra(env, cs->exception_index, env->error_code, - retaddr); + if (probe) { + return false; } - return true; + raise_exception_err_ra(&cpu->env, cs->exception_index, + cpu->env.error_code, retaddr); } +#endif diff --git a/target/ppc/translate.c b/target/ppc/translate.c index f65d1e8..d1f482b 100644 --- a/target/ppc/translate.c +++ b/target/ppc/translate.c @@ -4940,6 +4940,11 @@ static void gen_mtcrf(DisasContext *ctx) #if defined(TARGET_PPC64) static void gen_mtmsrd(DisasContext *ctx) { + if (unlikely(!is_book3s_arch2x(ctx))) { + gen_invalid(ctx); + return; + } + CHK_SV; #if !defined(CONFIG_USER_ONLY) diff --git a/tests/data/acpi/pc/DSDT b/tests/data/acpi/pc/DSDT Binary files differindex b9dd9b3..cc12237 100644 --- a/tests/data/acpi/pc/DSDT +++ b/tests/data/acpi/pc/DSDT diff --git a/tests/data/acpi/pc/DSDT.acpihmat b/tests/data/acpi/pc/DSDT.acpihmat Binary files differindex cba5a1d..2d0678e 100644 --- 
a/tests/data/acpi/pc/DSDT.acpihmat +++ b/tests/data/acpi/pc/DSDT.acpihmat diff --git a/tests/data/acpi/pc/DSDT.bridge b/tests/data/acpi/pc/DSDT.bridge Binary files differindex a9b4d56..77778c3 100644 --- a/tests/data/acpi/pc/DSDT.bridge +++ b/tests/data/acpi/pc/DSDT.bridge diff --git a/tests/data/acpi/pc/DSDT.cphp b/tests/data/acpi/pc/DSDT.cphp Binary files differindex 8d86155..af046b4 100644 --- a/tests/data/acpi/pc/DSDT.cphp +++ b/tests/data/acpi/pc/DSDT.cphp diff --git a/tests/data/acpi/pc/DSDT.dimmpxm b/tests/data/acpi/pc/DSDT.dimmpxm Binary files differindex e00a447..b56b2e0 100644 --- a/tests/data/acpi/pc/DSDT.dimmpxm +++ b/tests/data/acpi/pc/DSDT.dimmpxm diff --git a/tests/data/acpi/pc/DSDT.hpbridge b/tests/data/acpi/pc/DSDT.hpbridge Binary files differindex 5d8ba19..bb0593e 100644 --- a/tests/data/acpi/pc/DSDT.hpbridge +++ b/tests/data/acpi/pc/DSDT.hpbridge diff --git a/tests/data/acpi/pc/DSDT.ipmikcs b/tests/data/acpi/pc/DSDT.ipmikcs Binary files differindex 01e53bd..2e618e4 100644 --- a/tests/data/acpi/pc/DSDT.ipmikcs +++ b/tests/data/acpi/pc/DSDT.ipmikcs diff --git a/tests/data/acpi/pc/DSDT.memhp b/tests/data/acpi/pc/DSDT.memhp Binary files differindex b810379..c32d285 100644 --- a/tests/data/acpi/pc/DSDT.memhp +++ b/tests/data/acpi/pc/DSDT.memhp diff --git a/tests/data/acpi/pc/DSDT.nohpet b/tests/data/acpi/pc/DSDT.nohpet Binary files differindex d4f0050..623f06a 100644 --- a/tests/data/acpi/pc/DSDT.nohpet +++ b/tests/data/acpi/pc/DSDT.nohpet diff --git a/tests/data/acpi/pc/DSDT.numamem b/tests/data/acpi/pc/DSDT.numamem Binary files differindex 8632dfe..f0a3fa9 100644 --- a/tests/data/acpi/pc/DSDT.numamem +++ b/tests/data/acpi/pc/DSDT.numamem diff --git a/tests/qemu-iotests/172.out b/tests/qemu-iotests/172.out index d53f61d..4cf4d53 100644 --- a/tests/qemu-iotests/172.out +++ b/tests/qemu-iotests/172.out @@ -21,6 +21,7 @@ Testing: dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) 
physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -48,6 +49,7 @@ Testing: -fda TEST_DIR/t.qcow2 dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -85,6 +87,7 @@ Testing: -fdb TEST_DIR/t.qcow2 dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -96,6 +99,7 @@ Testing: -fdb TEST_DIR/t.qcow2 dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -137,6 +141,7 @@ Testing: -fda TEST_DIR/t.qcow2 -fdb TEST_DIR/t.qcow2.2 dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -148,6 +153,7 @@ Testing: -fda TEST_DIR/t.qcow2 -fdb TEST_DIR/t.qcow2.2 dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -190,6 +196,7 @@ Testing: -fdb dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -201,6 +208,7 @@ Testing: -fdb dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -228,6 +236,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -265,6 +274,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2,index=1 dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" 
logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -276,6 +286,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2,index=1 dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -317,6 +328,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=floppy,file=TEST_DIR/t dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -328,6 +340,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=floppy,file=TEST_DIR/t dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -373,6 +386,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0 dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -410,6 +424,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,unit=1 dev: floppy, id "" unit = 1 (0x1) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -447,6 +462,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qco dev: floppy, id "" unit = 1 (0x1) drive = "none1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -458,6 +474,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qco dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -509,6 +526,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive 
if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 1 (0x1) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -520,6 +538,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -562,6 +581,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 1 (0x1) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -573,6 +593,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -615,6 +636,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -626,6 +648,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -668,6 +691,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -679,6 +703,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = 
"auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -730,6 +755,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q dev: floppy, id "" unit = 1 (0x1) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -741,6 +767,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -783,6 +810,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q dev: floppy, id "" unit = 1 (0x1) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -794,6 +822,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -842,6 +871,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -global floppy.drive=none0 -device dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -909,6 +939,7 @@ Testing: -device floppy dev: floppy, id "" unit = 0 (0x0) drive = "" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -933,6 +964,7 @@ Testing: -device floppy,drive-type=120 dev: floppy, id "" unit = 0 (0x0) drive = "" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -957,6 +989,7 @@ Testing: -device floppy,drive-type=144 dev: floppy, id "" unit = 0 (0x0) drive = "" + backend_defaults = "auto" 
logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -981,6 +1014,7 @@ Testing: -device floppy,drive-type=288 dev: floppy, id "" unit = 0 (0x0) drive = "" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -1008,6 +1042,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,drive-t dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -1045,6 +1080,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,drive-t dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -1085,6 +1121,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,logical dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -1122,6 +1159,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,physica dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) diff --git a/tests/qtest/rtas-test.c b/tests/qtest/rtas-test.c index 16751db..5f1194a 100644 --- a/tests/qtest/rtas-test.c +++ b/tests/qtest/rtas-test.c @@ -5,7 +5,7 @@ #include "libqos/libqos-spapr.h" #include "libqos/rtas.h" -static void test_rtas_get_time_of_day(void) +static void run_test_rtas_get_time_of_day(const char *machine) { QOSState *qs; struct tm tm; @@ -13,7 +13,7 @@ static void test_rtas_get_time_of_day(void) uint64_t ret; time_t t1, t2; - qs = qtest_spapr_boot("-machine pseries"); + qs = qtest_spapr_boot(machine); t1 = time(NULL); ret = qrtas_get_time_of_day(qs->qts, &qs->alloc, &tm, 
&ns); @@ -24,6 +24,16 @@ static void test_rtas_get_time_of_day(void) qtest_shutdown(qs); } +static void test_rtas_get_time_of_day(void) +{ + run_test_rtas_get_time_of_day("-machine pseries"); +} + +static void test_rtas_get_time_of_day_vof(void) +{ + run_test_rtas_get_time_of_day("-machine pseries,x-vof=on"); +} + int main(int argc, char *argv[]) { const char *arch = qtest_get_arch(); @@ -35,6 +45,7 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } qtest_add_func("rtas/get-time-of-day", test_rtas_get_time_of_day); + qtest_add_func("rtas/get-time-of-day-vof", test_rtas_get_time_of_day_vof); return g_test_run(); } diff --git a/tests/unit/ptimer-test-stubs.c b/tests/unit/ptimer-test-stubs.c index 7f801a4..2a3ef58 100644 --- a/tests/unit/ptimer-test-stubs.c +++ b/tests/unit/ptimer-test-stubs.c @@ -108,7 +108,7 @@ int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int attr_mask) return deadline; } -QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque) +QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name) { QEMUBH *bh = g_new(QEMUBH, 1); diff --git a/util/async.c b/util/async.c index 5d9b7cc..9a41591 100644 --- a/util/async.c +++ b/util/async.c @@ -57,6 +57,7 @@ enum { struct QEMUBH { AioContext *ctx; + const char *name; QEMUBHFunc *cb; void *opaque; QSLIST_ENTRY(QEMUBH) next; @@ -107,7 +108,8 @@ static QEMUBH *aio_bh_dequeue(BHList *head, unsigned *flags) return bh; } -void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque) +void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, + void *opaque, const char *name) { QEMUBH *bh; bh = g_new(QEMUBH, 1); @@ -115,11 +117,13 @@ void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque) .ctx = ctx, .cb = cb, .opaque = opaque, + .name = name, }; aio_bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT); } -QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque) +QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, + const char *name) { 
QEMUBH *bh; bh = g_new(QEMUBH, 1); @@ -127,6 +131,7 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque) .ctx = ctx, .cb = cb, .opaque = opaque, + .name = name, }; return bh; } @@ -339,8 +344,20 @@ aio_ctx_finalize(GSource *source) assert(QSIMPLEQ_EMPTY(&ctx->bh_slice_list)); while ((bh = aio_bh_dequeue(&ctx->bh_list, &flags))) { - /* qemu_bh_delete() must have been called on BHs in this AioContext */ - assert(flags & BH_DELETED); + /* + * qemu_bh_delete() must have been called on BHs in this AioContext. In + * many cases memory leaks, hangs, or inconsistent state occur when a + * BH is leaked because something still expects it to run. + * + * If you hit this, fix the lifecycle of the BH so that + * qemu_bh_delete() and any associated cleanup is called before the + * AioContext is finalized. + */ + if (unlikely(!(flags & BH_DELETED))) { + fprintf(stderr, "%s: BH '%s' leaked, aborting...\n", + __func__, bh->name); + abort(); + } g_free(bh); } diff --git a/util/main-loop.c b/util/main-loop.c index 4ae5b23..06b18b1 100644 --- a/util/main-loop.c +++ b/util/main-loop.c @@ -544,9 +544,9 @@ void main_loop_wait(int nonblocking) /* Functions to operate on the main QEMU AioContext. */ -QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque) +QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name) { - return aio_bh_new(qemu_aio_context, cb, opaque); + return aio_bh_new_full(qemu_aio_context, cb, opaque, name); } /* diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index 838e286..893d864 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -225,6 +225,8 @@ static void *mmap_activate(void *ptr, size_t size, int fd, "crash.\n", file_name); g_free(proc_link); g_free(file_name); + warn_report("Using non DAX backing file with 'pmem=on' option" + " is deprecated"); } /* * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, we will try |