diff options
Diffstat (limited to 'hw')
42 files changed, 1047 insertions, 423 deletions
diff --git a/hw/char/sh_serial.c b/hw/char/sh_serial.c index 6abd803..30447fa 100644 --- a/hw/char/sh_serial.c +++ b/hw/char/sh_serial.c @@ -78,10 +78,6 @@ struct SHSerialState { qemu_irq bri; }; -typedef struct {} SHSerialStateClass; - -OBJECT_DEFINE_TYPE(SHSerialState, sh_serial, SH_SERIAL, SYS_BUS_DEVICE) - static void sh_serial_clear_fifo(SHSerialState *s) { memset(s->rx_fifo, 0, SH_RX_FIFO_LENGTH); @@ -434,17 +430,13 @@ static void sh_serial_realize(DeviceState *d, Error **errp) s->etu = NANOSECONDS_PER_SECOND / 9600; } -static void sh_serial_finalize(Object *obj) +static void sh_serial_unrealize(DeviceState *dev) { - SHSerialState *s = SH_SERIAL(obj); + SHSerialState *s = SH_SERIAL(dev); timer_del(&s->fifo_timeout_timer); } -static void sh_serial_init(Object *obj) -{ -} - static const Property sh_serial_properties[] = { DEFINE_PROP_CHR("chardev", SHSerialState, chr), DEFINE_PROP_UINT8("features", SHSerialState, feat, 0), @@ -456,7 +448,19 @@ static void sh_serial_class_init(ObjectClass *oc, const void *data) device_class_set_props(dc, sh_serial_properties); dc->realize = sh_serial_realize; + dc->unrealize = sh_serial_unrealize; device_class_set_legacy_reset(dc, sh_serial_reset); /* Reason: part of SuperH CPU/SoC, needs to be wired up */ dc->user_creatable = false; } + +static const TypeInfo sh_serial_types[] = { + { + .name = TYPE_SH_SERIAL, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(SHSerialState), + .class_init = sh_serial_class_init, + }, +}; + +DEFINE_TYPES(sh_serial_types) diff --git a/hw/display/apple-gfx.m b/hw/display/apple-gfx.m index 8dde1f1..174d56a 100644 --- a/hw/display/apple-gfx.m +++ b/hw/display/apple-gfx.m @@ -454,7 +454,7 @@ static void set_cursor_glyph(void *opaque) /* ------ DMA (device reading system memory) ------ */ typedef struct AppleGFXReadMemoryJob { - QemuSemaphore sem; + QemuEvent event; hwaddr physical_address; uint64_t length; void *dst; @@ -470,7 +470,7 @@ static void apple_gfx_do_read_memory(void *opaque) job->dst, job->length, MEMTXATTRS_UNSPECIFIED); job->success = (r == MEMTX_OK); - qemu_sem_post(&job->sem); + qemu_event_set(&job->event); } static bool apple_gfx_read_memory(AppleGFXState *s, hwaddr physical_address, @@ -483,11 +483,11 @@ static bool apple_gfx_read_memory(AppleGFXState *s, hwaddr physical_address, trace_apple_gfx_read_memory(physical_address, length, dst); /* Performing DMA requires BQL, so do it in a BH. */ - qemu_sem_init(&job.sem, 0); + qemu_event_init(&job.event, 0); aio_bh_schedule_oneshot(qemu_get_aio_context(), apple_gfx_do_read_memory, &job); - qemu_sem_wait(&job.sem); - qemu_sem_destroy(&job.sem); + qemu_event_wait(&job.event); + qemu_event_destroy(&job.event); return job.success; } diff --git a/hw/gpio/pca9552.c b/hw/gpio/pca9552.c index d65c0a2..1e10238 100644 --- a/hw/gpio/pca9552.c +++ b/hw/gpio/pca9552.c @@ -76,7 +76,7 @@ static void pca955x_display_pins_status(PCA955xState *s, return; } if (trace_event_get_state_backends(TRACE_PCA955X_GPIO_STATUS)) { - char *buf = g_newa(char, k->pin_count + 1); + char buf[PCA955X_PIN_COUNT_MAX + 1]; for (i = 0; i < k->pin_count; i++) { if (extract32(pins_status, i, 1)) { diff --git a/hw/hyperv/hv-balloon.c b/hw/hyperv/hv-balloon.c index 94b0abb..6dbcb2d 100644 --- a/hw/hyperv/hv-balloon.c +++ b/hw/hyperv/hv-balloon.c @@ -67,10 +67,6 @@ * these requests */ -struct HvBalloonClass { - VMBusDeviceClass parent_class; -} HvBalloonClass; - typedef enum State { /* not a real state */ S_NO_CHANGE = 0, @@ -162,8 +158,9 @@ typedef struct HvBalloon { MemoryRegion *mr; } HvBalloon; -OBJECT_DEFINE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, HV_BALLOON, VMBUS_DEVICE, \ - { TYPE_MEMORY_DEVICE }, { }) +OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, \ + HV_BALLOON, VMBUS_DEVICE, \ + { TYPE_MEMORY_DEVICE }, { }) #define HV_BALLOON_SET_STATE(hvb, news) \ do { \ diff --git a/hw/intc/loongarch_pch_pic.c b/hw/intc/loongarch_pch_pic.c index cbba2fc..ebb33ed 100644 --- a/hw/intc/loongarch_pch_pic.c +++ b/hw/intc/loongarch_pch_pic.c @@ -82,7 +82,7 @@ static uint64_t pch_pic_read(void *opaque, hwaddr addr, uint64_t field_mask) addr -= offset; switch (addr) { case PCH_PIC_INT_ID: - val = s->id.data; + val = cpu_to_le64(s->id.data); break; case PCH_PIC_INT_MASK: val = s->int_mask; diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index 9b6292e..14d6c52 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -35,12 +35,6 @@ struct loongarch_linux_hdr { uint32_t pe_header_offset; } QEMU_PACKED; -struct memmap_entry *memmap_table; -unsigned memmap_entries; - -ram_addr_t initrd_offset; -uint64_t initrd_size; - static const unsigned int slave_boot_code[] = { /* Configure reset ebase. */ 0x0400302c, /* csrwr $t0, LOONGARCH_CSR_EENTRY */ @@ -94,12 +88,16 @@ static inline void *guidcpy(void *dst, const void *src) return memcpy(dst, src, sizeof(efi_guid_t)); } -static void init_efi_boot_memmap(struct efi_system_table *systab, +static void init_efi_boot_memmap(MachineState *ms, + struct efi_system_table *systab, void *p, void *start) { unsigned i; struct efi_boot_memmap *boot_memmap = p; efi_guid_t tbl_guid = LINUX_EFI_BOOT_MEMMAP_GUID; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms); + struct memmap_entry *memmap_table; + unsigned int memmap_entries; /* efi_configuration_table 1 */ guidcpy(&systab->tables[0].guid, &tbl_guid); @@ -111,6 +109,8 @@ static void init_efi_boot_memmap(struct efi_system_table *systab, boot_memmap->map_size = 0; efi_memory_desc_t *map = p + sizeof(struct efi_boot_memmap); + memmap_table = lvms->memmap_table; + memmap_entries = lvms->memmap_entries; for (i = 0; i < memmap_entries; i++) { map = (void *)boot_memmap + sizeof(*map); map[i].type = memmap_table[i].type; @@ -121,7 +121,8 @@ static void init_efi_boot_memmap(struct efi_system_table *systab, } } -static void init_efi_initrd_table(struct efi_system_table *systab, +static void init_efi_initrd_table(struct loongarch_boot_info *info, + struct efi_system_table *systab, void *p, void *start) { efi_guid_t tbl_guid = LINUX_EFI_INITRD_MEDIA_GUID; @@ -132,8 +133,8 @@ static void init_efi_initrd_table(struct efi_system_table *systab, systab->tables[1].table = (struct efi_configuration_table *)(p - start); systab->nr_tables = 2; - initrd_table->base = initrd_offset; - initrd_table->size = initrd_size; + initrd_table->base = info->initrd_addr; + initrd_table->size = info->initrd_size; } static void init_efi_fdt_table(struct efi_system_table *systab) @@ -146,10 +147,12 @@ static void init_efi_fdt_table(struct efi_system_table *systab) systab->nr_tables = 3; } -static void init_systab(struct loongarch_boot_info *info, void *p, void *start) +static void init_systab(MachineState *ms, + struct loongarch_boot_info *info, void *p, void *start) { void *bp_tables_start; struct efi_system_table *systab = p; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms); info->a2 = p - start; @@ -166,10 +169,10 @@ static void init_systab(struct loongarch_boot_info *info, void *p, void *start) systab->tables = p; bp_tables_start = p; - init_efi_boot_memmap(systab, p, start); + init_efi_boot_memmap(ms, systab, p, start); p += ROUND_UP(sizeof(struct efi_boot_memmap) + - sizeof(efi_memory_desc_t) * memmap_entries, 64 * KiB); - init_efi_initrd_table(systab, p, start); + sizeof(efi_memory_desc_t) * lvms->memmap_entries, 64 * KiB); + init_efi_initrd_table(info, systab, p, start); p += ROUND_UP(sizeof(struct efi_initrd), 64 * KiB); init_efi_fdt_table(systab); @@ -276,8 +279,8 @@ static ram_addr_t alloc_initrd_memory(struct loongarch_boot_info *info, static int64_t load_kernel_info(struct loongarch_boot_info *info) { - uint64_t kernel_entry, kernel_low, kernel_high; - ssize_t kernel_size; + uint64_t kernel_entry, kernel_low, kernel_high, initrd_offset = 0; + ssize_t kernel_size, initrd_size; kernel_size = load_elf(info->kernel_filename, NULL, cpu_loongarch_virt_to_phys, NULL, @@ -313,8 +316,9 @@ static int64_t load_kernel_info(struct loongarch_boot_info *info) info->initrd_filename); exit(1); } - } else { - initrd_size = 0; + + info->initrd_addr = initrd_offset; + info->initrd_size = initrd_size; } return kernel_entry; @@ -369,17 +373,19 @@ static void loongarch_firmware_boot(LoongArchVirtMachineState *lvms, fw_cfg_add_kernel_info(info, lvms->fw_cfg); } -static void init_boot_rom(struct loongarch_boot_info *info, void *p) +static void init_boot_rom(MachineState *ms, + struct loongarch_boot_info *info, void *p) { void *start = p; init_cmdline(info, p, start); p += COMMAND_LINE_SIZE; - init_systab(info, p, start); + init_systab(ms, info, p, start); } -static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) +static void loongarch_direct_kernel_boot(MachineState *ms, + struct loongarch_boot_info *info) { void *p, *bp; int64_t kernel_addr = VIRT_FLASH0_BASE; @@ -397,7 +403,7 @@ static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) /* Load cmdline and system tables at [0 - 1 MiB] */ p = g_malloc0(1 * MiB); bp = p; - init_boot_rom(info, p); + init_boot_rom(ms, info, p); rom_add_blob_fixed_as("boot_info", bp, 1 * MiB, 0, &address_space_memory); /* Load slave boot code at pflash0 . */ @@ -437,6 +443,6 @@ void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info) if (lvms->bios_loaded) { loongarch_firmware_boot(lvms, info); } else { - loongarch_direct_kernel_boot(info); + loongarch_direct_kernel_boot(ms, info); } } diff --git a/hw/loongarch/virt-acpi-build.c b/hw/loongarch/virt-acpi-build.c index 073b6de..2cd2d9d 100644 --- a/hw/loongarch/virt-acpi-build.c +++ b/hw/loongarch/virt-acpi-build.c @@ -575,8 +575,8 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) acpi_add_table(table_offsets, tables_blob); { AcpiMcfgInfo mcfg = { - .base = cpu_to_le64(VIRT_PCI_CFG_BASE), - .size = cpu_to_le64(VIRT_PCI_CFG_SIZE), + .base = VIRT_PCI_CFG_BASE, + .size = VIRT_PCI_CFG_SIZE, }; build_mcfg(tables_blob, tables->linker, &mcfg, lvms->oem_id, lvms->oem_table_id); diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 1b50404..34dfbd1 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -136,6 +136,10 @@ static void virt_build_smbios(LoongArchVirtMachineState *lvms) return; } + if (kvm_enabled()) { + product = "KVM Virtual Machine"; + } + smbios_set_defaults("QEMU", product, mc->name); smbios_get_tables(ms, SMBIOS_ENTRY_POINT_TYPE_64, @@ -168,8 +172,15 @@ static void virt_powerdown_req(Notifier *notifier, void *opaque) acpi_send_event(s->acpi_ged, ACPI_POWER_DOWN_STATUS); } -static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type) +static void memmap_add_entry(MachineState *ms, uint64_t address, + uint64_t length, uint32_t type) { + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms); + struct memmap_entry *memmap_table; + unsigned int memmap_entries; + + memmap_table = lvms->memmap_table; + memmap_entries = lvms->memmap_entries; /* Ensure there are no duplicate entries. */ for (unsigned i = 0; i < memmap_entries; i++) { assert(memmap_table[i].address != address); @@ -182,6 +193,8 @@ static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type) memmap_table[memmap_entries].type = cpu_to_le32(type); memmap_table[memmap_entries].reserved = 0; memmap_entries++; + lvms->memmap_table = memmap_table; + lvms->memmap_entries = memmap_entries; } static DeviceState *create_acpi_ged(DeviceState *pch_pic, @@ -619,13 +632,13 @@ static void fw_cfg_add_memory(MachineState *ms) } if (size >= gap) { - memmap_add_entry(base, gap, 1); + memmap_add_entry(ms, base, gap, 1); size -= gap; base = VIRT_HIGHMEM_BASE; } if (size) { - memmap_add_entry(base, size, 1); + memmap_add_entry(ms, base, size, 1); base += size; } @@ -640,7 +653,7 @@ static void fw_cfg_add_memory(MachineState *ms) * lowram: [base, +(gap - numa_info[0].node_mem)) * highram: [VIRT_HIGHMEM_BASE, +(ram_size - gap)) */ - memmap_add_entry(base, gap - numa_info[0].node_mem, 1); + memmap_add_entry(ms, base, gap - numa_info[0].node_mem, 1); size = ram_size - gap; base = VIRT_HIGHMEM_BASE; } else { @@ -648,7 +661,7 @@ static void fw_cfg_add_memory(MachineState *ms) } if (size) { - memmap_add_entry(base, size, 1); + memmap_add_entry(ms, base, size, 1); } } @@ -734,8 +747,8 @@ static void virt_init(MachineState *machine) rom_set_fw(lvms->fw_cfg); if (lvms->fw_cfg != NULL) { fw_cfg_add_file(lvms->fw_cfg, "etc/memmap", - memmap_table, - sizeof(struct memmap_entry) * (memmap_entries)); + lvms->memmap_table, + sizeof(struct memmap_entry) * lvms->memmap_entries); } /* Initialize the IO interrupt subsystem */ diff --git a/hw/misc/stm32_rcc.c b/hw/misc/stm32_rcc.c index 94e8dae..5815b3e 100644 --- a/hw/misc/stm32_rcc.c +++ b/hw/misc/stm32_rcc.c @@ -60,7 +60,7 @@ static void stm32_rcc_write(void *opaque, hwaddr addr, uint32_t value = val64; uint32_t prev_value, new_value, irq_offset; - trace_stm32_rcc_write(value, addr); + trace_stm32_rcc_write(addr, value); if (addr > STM32_RCC_DCKCFGR2) { qemu_log_mask(LOG_GUEST_ERROR, "%s: Bad offset 0x%"HWADDR_PRIx"\n", diff --git a/hw/net/fsl_etsec/etsec.c b/hw/net/fsl_etsec/etsec.c index d14cb2a..846f6cb 100644 --- a/hw/net/fsl_etsec/etsec.c +++ b/hw/net/fsl_etsec/etsec.c @@ -389,6 +389,7 @@ static void etsec_realize(DeviceState *dev, Error **errp) { eTSEC *etsec = ETSEC_COMMON(dev); + qemu_macaddr_default_if_unset(&etsec->conf.macaddr); etsec->nic = qemu_new_nic(&net_etsec_info, &etsec->conf, object_get_typename(OBJECT(dev)), dev->id, &dev->mem_reentrancy_guard, etsec); diff --git a/hw/net/i82596.c b/hw/net/i82596.c index 64ed3c8..c1ff3e6 100644 --- a/hw/net/i82596.c +++ b/hw/net/i82596.c @@ -5,7 +5,7 @@ * This work is licensed under the GNU GPL license version 2 or later. * * This software was written to be compatible with the specification: - * https://www.intel.com/assets/pdf/general/82596ca.pdf + * https://parisc.docs.kernel.org/en/latest/_downloads/96672be0650d9fc046bbcea40b92482f/82596CA.pdf */ #include "qemu/osdep.h" @@ -177,6 +177,26 @@ static void set_individual_address(I82596State *s, uint32_t addr) trace_i82596_new_mac(nc->info_str); } +static void i82596_configure(I82596State *s, uint32_t addr) +{ + uint8_t byte_cnt; + byte_cnt = get_byte(addr + 8) & 0x0f; + + byte_cnt = MAX(byte_cnt, 4); + byte_cnt = MIN(byte_cnt, sizeof(s->config)); + /* copy byte_cnt max. */ + address_space_read(&address_space_memory, addr + 8, + MEMTXATTRS_UNSPECIFIED, s->config, byte_cnt); + /* config byte according to page 35ff */ + s->config[2] &= 0x82; /* mask valid bits */ + s->config[2] |= 0x40; + s->config[7] &= 0xf7; /* clear zero bit */ + assert(I596_NOCRC_INS == 0); /* do CRC insertion */ + s->config[10] = MAX(s->config[10], 5); /* min frame length */ + s->config[12] &= 0x40; /* only full duplex field valid */ + s->config[13] |= 0x3f; /* set ones in byte 13 */ +} + static void set_multicast_list(I82596State *s, uint32_t addr) { uint16_t mc_count, i; @@ -234,7 +254,6 @@ static void command_loop(I82596State *s) { uint16_t cmd; uint16_t status; - uint8_t byte_cnt; DBG(printf("STARTING COMMAND LOOP cmd_p=%08x\n", s->cmd_p)); @@ -254,20 +273,7 @@ static void command_loop(I82596State *s) set_individual_address(s, s->cmd_p); break; case CmdConfigure: - byte_cnt = get_byte(s->cmd_p + 8) & 0x0f; - byte_cnt = MAX(byte_cnt, 4); - byte_cnt = MIN(byte_cnt, sizeof(s->config)); - /* copy byte_cnt max. */ - address_space_read(&address_space_memory, s->cmd_p + 8, - MEMTXATTRS_UNSPECIFIED, s->config, byte_cnt); - /* config byte according to page 35ff */ - s->config[2] &= 0x82; /* mask valid bits */ - s->config[2] |= 0x40; - s->config[7] &= 0xf7; /* clear zero bit */ - assert(I596_NOCRC_INS == 0); /* do CRC insertion */ - s->config[10] = MAX(s->config[10], 5); /* min frame length */ - s->config[12] &= 0x40; /* only full duplex field valid */ - s->config[13] |= 0x3f; /* set ones in byte 13 */ + i82596_configure(s, s->cmd_p); break; case CmdTDR: /* get signal LINK */ diff --git a/hw/pci-host/ppce500.c b/hw/pci-host/ppce500.c index e97a515..52269b0 100644 --- a/hw/pci-host/ppce500.c +++ b/hw/pci-host/ppce500.c @@ -16,7 +16,6 @@ #include "qemu/osdep.h" #include "hw/irq.h" -#include "hw/ppc/e500-ccsr.h" #include "hw/qdev-properties.h" #include "migration/vmstate.h" #include "hw/pci/pci_device.h" @@ -418,11 +417,12 @@ static const VMStateDescription vmstate_ppce500_pci = { static void e500_pcihost_bridge_realize(PCIDevice *d, Error **errp) { PPCE500PCIBridgeState *b = PPC_E500_PCI_BRIDGE(d); - PPCE500CCSRState *ccsr = CCSR( + SysBusDevice *ccsr = SYS_BUS_DEVICE( object_resolve_path_component(qdev_get_machine(), "e500-ccsr")); + MemoryRegion *ccsr_space = sysbus_mmio_get_region(ccsr, 0); - memory_region_init_alias(&b->bar0, OBJECT(ccsr), "e500-pci-bar0", &ccsr->ccsr_space, - 0, int128_get64(ccsr->ccsr_space.size)); + memory_region_init_alias(&b->bar0, OBJECT(ccsr), "e500-pci-bar0", + ccsr_space, 0, int128_get64(ccsr_space->size)); pci_register_bar(d, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &b->bar0); } diff --git a/hw/pci-host/raven.c b/hw/pci-host/raven.c index 21f7ca6..f8c0be5 100644 --- a/hw/pci-host/raven.c +++ b/hw/pci-host/raven.c @@ -24,7 +24,6 @@ */ #include "qemu/osdep.h" -#include "qemu/datadir.h" #include "qemu/units.h" #include "qemu/log.h" #include "qapi/error.h" @@ -35,9 +34,7 @@ #include "migration/vmstate.h" #include "hw/intc/i8259.h" #include "hw/irq.h" -#include "hw/loader.h" #include "hw/or-irq.h" -#include "elf.h" #include "qom/object.h" #define TYPE_RAVEN_PCI_DEVICE "raven" @@ -47,10 +44,6 @@ OBJECT_DECLARE_SIMPLE_TYPE(RavenPCIState, RAVEN_PCI_DEVICE) struct RavenPCIState { PCIDevice dev; - - uint32_t elf_machine; - char *bios_name; - MemoryRegion bios; }; typedef struct PRePPCIState PREPPCIState; @@ -75,11 +68,8 @@ struct PRePPCIState { RavenPCIState pci_dev; int contiguous_map; - bool is_legacy_prep; }; -#define BIOS_SIZE (1 * MiB) - #define PCI_IO_BASE_ADDR 0x80000000 /* Physical address on main bus */ static inline uint32_t raven_pci_io_config(hwaddr addr) @@ -243,22 +233,18 @@ static void raven_pcihost_realizefn(DeviceState *d, Error **errp) MemoryRegion *address_space_mem = get_system_memory(); int i; - if (s->is_legacy_prep) { - for (i = 0; i < PCI_NUM_PINS; i++) { - sysbus_init_irq(dev, &s->pci_irqs[i]); - } - } else { - /* According to PReP specification section 6.1.6 "System Interrupt - * Assignments", all PCI interrupts are routed via IRQ 15 */ - s->or_irq = OR_IRQ(object_new(TYPE_OR_IRQ)); - object_property_set_int(OBJECT(s->or_irq), "num-lines", PCI_NUM_PINS, - &error_fatal); - qdev_realize(DEVICE(s->or_irq), NULL, &error_fatal); - sysbus_init_irq(dev, &s->or_irq->out_irq); - - for (i = 0; i < PCI_NUM_PINS; i++) { - s->pci_irqs[i] = qdev_get_gpio_in(DEVICE(s->or_irq), i); - } + /* + * According to PReP specification section 6.1.6 "System Interrupt + * Assignments", all PCI interrupts are routed via IRQ 15 + */ + s->or_irq = OR_IRQ(object_new(TYPE_OR_IRQ)); + object_property_set_int(OBJECT(s->or_irq), "num-lines", PCI_NUM_PINS, + &error_fatal); + qdev_realize(DEVICE(s->or_irq), NULL, &error_fatal); + sysbus_init_irq(dev, &s->or_irq->out_irq); + + for (i = 0; i < PCI_NUM_PINS; i++) { + s->pci_irqs[i] = qdev_get_gpio_in(DEVICE(s->or_irq), i); } qdev_init_gpio_in(d, raven_change_gpio, 1); @@ -338,48 +324,9 @@ static void raven_pcihost_initfn(Object *obj) static void raven_realize(PCIDevice *d, Error **errp) { - RavenPCIState *s = RAVEN_PCI_DEVICE(d); - char *filename; - int bios_size = -1; - d->config[PCI_CACHE_LINE_SIZE] = 0x08; d->config[PCI_LATENCY_TIMER] = 0x10; d->config[PCI_CAPABILITY_LIST] = 0x00; - - if (!memory_region_init_rom_nomigrate(&s->bios, OBJECT(s), "bios", - BIOS_SIZE, errp)) { - return; - } - memory_region_add_subregion(get_system_memory(), (uint32_t)(-BIOS_SIZE), - &s->bios); - if (s->bios_name) { - filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, s->bios_name); - if (filename) { - if (s->elf_machine != EM_NONE) { - bios_size = load_elf(filename, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, - ELFDATA2MSB, s->elf_machine, 0, 0); - } - if (bios_size < 0) { - bios_size = get_image_size(filename); - if (bios_size > 0 && bios_size <= BIOS_SIZE) { - hwaddr bios_addr; - bios_size = (bios_size + 0xfff) & ~0xfff; - bios_addr = (uint32_t)(-BIOS_SIZE); - bios_size = load_image_targphys(filename, bios_addr, - bios_size); - } - } - } - g_free(filename); - if (bios_size < 0 || bios_size > BIOS_SIZE) { - memory_region_del_subregion(get_system_memory(), &s->bios); - error_setg(errp, "Could not load bios image '%s'", s->bios_name); - return; - } - } - - vmstate_register_ram_global(&s->bios); } static const VMStateDescription vmstate_raven = { @@ -422,22 +369,12 @@ static const TypeInfo raven_info = { }, }; -static const Property raven_pcihost_properties[] = { - DEFINE_PROP_UINT32("elf-machine", PREPPCIState, pci_dev.elf_machine, - EM_NONE), - DEFINE_PROP_STRING("bios-name", PREPPCIState, pci_dev.bios_name), - /* Temporary workaround until legacy prep machine is removed */ - DEFINE_PROP_BOOL("is-legacy-prep", PREPPCIState, is_legacy_prep, - false), -}; - static void raven_pcihost_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); dc->realize = raven_pcihost_realizefn; - device_class_set_props(dc, raven_pcihost_properties); dc->fw_name = "pci"; } diff --git a/hw/pci/msix.c b/hw/pci/msix.c index 66f27b9..8c7f670 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -72,7 +72,7 @@ static uint8_t *msix_pending_byte(PCIDevice *dev, int vector) return dev->msix_pba + vector / 8; } -static int msix_is_pending(PCIDevice *dev, int vector) +int msix_is_pending(PCIDevice *dev, unsigned int vector) { return *msix_pending_byte(dev, vector) & msix_pending_mask(vector); } diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 9b4bf48..c70b5ce 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -32,6 +32,7 @@ #include "hw/pci/pci_host.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "migration/cpr.h" #include "migration/qemu-file-types.h" #include "migration/vmstate.h" #include "net/net.h" @@ -537,6 +538,10 @@ static void pci_reset_regions(PCIDevice *dev) static void pci_do_device_reset(PCIDevice *dev) { + if ((dev->cap_present & QEMU_PCI_SKIP_RESET_ON_CPR) && cpr_is_incoming()) { + return; + } + pci_device_deassert_intx(dev); assert(dev->irq_state == 0); diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index 809078a..6899802 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -79,8 +79,6 @@ #define MPC85XX_ESDHC_IRQ 72 #define RTC_REGS_OFFSET 0x68 -#define PLATFORM_CLK_FREQ_HZ (400 * 1000 * 1000) - struct boot_info { uint32_t dt_base; @@ -120,7 +118,7 @@ static uint32_t *pci_map_create(void *fdt, uint32_t mpic, int first_slot, } static void dt_serial_create(void *fdt, unsigned long long offset, - const char *soc, const char *mpic, + const char *soc, uint32_t freq, const char *mpic, const char *alias, int idx, bool defcon) { char *ser; @@ -131,7 +129,7 @@ static void dt_serial_create(void *fdt, unsigned long long offset, qemu_fdt_setprop_string(fdt, ser, "compatible", "ns16550"); qemu_fdt_setprop_cells(fdt, ser, "reg", offset, 0x100); qemu_fdt_setprop_cell(fdt, ser, "cell-index", idx); - qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", PLATFORM_CLK_FREQ_HZ); + qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", freq); qemu_fdt_setprop_cells(fdt, ser, "interrupts", 42, 2); qemu_fdt_setprop_phandle(fdt, ser, "interrupt-parent", mpic); qemu_fdt_setprop_string(fdt, "/aliases", alias, ser); @@ -382,8 +380,7 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms, int fdt_size; void *fdt; uint8_t hypercall[16]; - uint32_t clock_freq = PLATFORM_CLK_FREQ_HZ; - uint32_t tb_freq = PLATFORM_CLK_FREQ_HZ; + uint32_t clock_freq, tb_freq; int i; char compatible_sb[] = "fsl,mpc8544-immr\0simple-bus"; char *soc; @@ -484,6 +481,9 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms, if (kvmppc_get_hasidle(env)) { qemu_fdt_setprop(fdt, "/hypervisor", "has-idle", NULL, 0); } + } else { + clock_freq = pmc->clock_freq; + tb_freq = pmc->tb_freq; } /* Create CPU nodes */ @@ -564,12 +564,12 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms, */ if (serial_hd(1)) { dt_serial_create(fdt, MPC8544_SERIAL1_REGS_OFFSET, - soc, mpic, "serial1", 1, false); + soc, pmc->clock_freq, mpic, "serial1", 1, false); } if (serial_hd(0)) { dt_serial_create(fdt, MPC8544_SERIAL0_REGS_OFFSET, - soc, mpic, "serial0", 0, true); + soc, pmc->clock_freq, mpic, "serial0", 0, true); } /* i2c */ @@ -931,7 +931,6 @@ void ppce500_init(MachineState *machine) CPUPPCState *firstenv = NULL; MemoryRegion *ccsr_addr_space; SysBusDevice *s; - PPCE500CCSRState *ccsr; I2CBus *i2c; irqs = g_new0(IrqLines, smp_cpus); @@ -968,7 +967,7 @@ void ppce500_init(MachineState *machine) env->spr_cb[SPR_BOOKE_PIR].default_value = cs->cpu_index = i; env->mpic_iack = pmc->ccsrbar_base + MPC8544_MPIC_REGS_OFFSET + 0xa0; - ppc_booke_timers_init(cpu, PLATFORM_CLK_FREQ_HZ, PPC_TIMER_E500); + ppc_booke_timers_init(cpu, pmc->tb_freq, PPC_TIMER_E500); /* Register reset handler */ if (!i) { @@ -993,10 +992,10 @@ void ppce500_init(MachineState *machine) memory_region_add_subregion(address_space_mem, 0, machine->ram); dev = qdev_new("e500-ccsr"); + s = SYS_BUS_DEVICE(dev); object_property_add_child(OBJECT(machine), "e500-ccsr", OBJECT(dev)); - sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); - ccsr = CCSR(dev); - ccsr_addr_space = &ccsr->ccsr_space; + sysbus_realize_and_unref(s, &error_fatal); + ccsr_addr_space = sysbus_mmio_get_region(s, 0); memory_region_add_subregion(address_space_mem, pmc->ccsrbar_base, ccsr_addr_space); @@ -1284,6 +1283,7 @@ static void e500_ccsr_initfn(Object *obj) PPCE500CCSRState *ccsr = CCSR(obj); memory_region_init(&ccsr->ccsr_space, obj, "e500-ccsr", MPC8544_CCSRBAR_SIZE); + sysbus_init_mmio(SYS_BUS_DEVICE(ccsr), &ccsr->ccsr_space); } static const TypeInfo e500_ccsr_info = { diff --git a/hw/ppc/e500.h b/hw/ppc/e500.h index 01db102..00f4905 100644 --- a/hw/ppc/e500.h +++ b/hw/ppc/e500.h @@ -5,6 +5,8 @@ #include "hw/platform-bus.h" #include "qom/object.h" +#define PLATFORM_CLK_FREQ_HZ (400 * 1000 * 1000) + struct PPCE500MachineState { /*< private >*/ MachineState parent_obj; @@ -37,6 +39,8 @@ struct PPCE500MachineClass { hwaddr pci_mmio_base; hwaddr pci_mmio_bus_base; hwaddr spin_base; + uint32_t clock_freq; + uint32_t tb_freq; }; void ppce500_init(MachineState *machine); diff --git a/hw/ppc/e500plat.c b/hw/ppc/e500plat.c index 775b9d8..4f1d659 100644 --- a/hw/ppc/e500plat.c +++ b/hw/ppc/e500plat.c @@ -93,6 +93,8 @@ static void e500plat_machine_class_init(ObjectClass *oc, const void *data) pmc->pci_mmio_base = 0xC00000000ULL; pmc->pci_mmio_bus_base = 0xE0000000ULL; pmc->spin_base = 0xFEF000000ULL; + pmc->clock_freq = PLATFORM_CLK_FREQ_HZ; + pmc->tb_freq = PLATFORM_CLK_FREQ_HZ; mc->desc = "generic paravirt e500 platform"; mc->init = e500plat_init; diff --git a/hw/ppc/mpc8544ds.c b/hw/ppc/mpc8544ds.c index 97fb0f3..5826985 100644 --- a/hw/ppc/mpc8544ds.c +++ b/hw/ppc/mpc8544ds.c @@ -55,6 +55,8 @@ static void mpc8544ds_machine_class_init(ObjectClass *oc, const void *data) pmc->pci_mmio_bus_base = 0xC0000000ULL; pmc->pci_pio_base = 0xE1000000ULL; pmc->spin_base = 0xEF000000ULL; + pmc->clock_freq = PLATFORM_CLK_FREQ_HZ; + pmc->tb_freq = PLATFORM_CLK_FREQ_HZ; mc->desc = "mpc8544ds"; mc->init = mpc8544ds_init; diff --git a/hw/ppc/prep.c b/hw/ppc/prep.c index 7395263..982e40e 100644 --- a/hw/ppc/prep.c +++ b/hw/ppc/prep.c @@ -35,6 +35,7 @@ #include "qapi/error.h" #include "qemu/error-report.h" #include "qemu/log.h" +#include "qemu/datadir.h" #include "hw/loader.h" #include "hw/rtc/mc146818rtc.h" #include "hw/isa/pc87312.h" @@ -55,6 +56,8 @@ #define KERNEL_LOAD_ADDR 0x01000000 #define INITRD_LOAD_ADDR 0x01800000 +#define BIOS_ADDR 0xfff00000 +#define BIOS_SIZE (1 * MiB) #define NVRAM_SIZE 0x2000 static void fw_cfg_boot_set(void *opaque, const char *boot_device, @@ -241,6 +244,9 @@ static void ibm_40p_init(MachineState *machine) ISADevice *isa_dev; ISABus *isa_bus; void *fw_cfg; + MemoryRegion *bios = g_new(MemoryRegion, 1); + char *filename; + ssize_t bios_size = -1; uint32_t kernel_base = 0, initrd_base = 0; long kernel_size = 0, initrd_size = 0; char boot_device; @@ -263,10 +269,27 @@ static void ibm_40p_init(MachineState *machine) cpu_ppc_tb_init(env, 100UL * 1000UL * 1000UL); qemu_register_reset(ppc_prep_reset, cpu); + /* allocate and load firmware */ + filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name); + if (!filename) { + error_report("Could not find bios image '%s'", bios_name); + exit(1); + } + memory_region_init_rom(bios, NULL, "bios", BIOS_SIZE, &error_fatal); + memory_region_add_subregion(get_system_memory(), BIOS_ADDR, bios); + bios_size = load_elf(filename, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + ELFDATA2MSB, PPC_ELF_MACHINE, 0, 0); + if (bios_size < 0) { + bios_size = load_image_targphys(filename, BIOS_ADDR, BIOS_SIZE); + } + if (bios_size < 0 || bios_size > BIOS_SIZE) { + error_report("Could not load bios image '%s'", filename); + return; + } + g_free(filename); + /* PCI host */ dev = qdev_new("raven-pcihost"); - qdev_prop_set_string(dev, "bios-name", bios_name); - qdev_prop_set_uint32(dev, "elf-machine", PPC_ELF_MACHINE); pcihost = SYS_BUS_DEVICE(dev); object_property_add_child(qdev_get_machine(), "raven", OBJECT(dev)); sysbus_realize_and_unref(pcihost, &error_fatal); diff --git a/hw/riscv/riscv-iommu-pci.c b/hw/riscv/riscv-iommu-pci.c index 1f44eef..cdb4a7a 100644 --- a/hw/riscv/riscv-iommu-pci.c +++ b/hw/riscv/riscv-iommu-pci.c @@ -68,12 +68,6 @@ typedef struct RISCVIOMMUStatePci { RISCVIOMMUState iommu; /* common IOMMU state */ } RISCVIOMMUStatePci; -struct RISCVIOMMUPciClass { - /*< public >*/ - DeviceRealize parent_realize; - ResettablePhases parent_phases; -}; - /* interrupt delivery callback */ static void riscv_iommu_pci_notify(RISCVIOMMUState *iommu, unsigned vector) { diff --git a/hw/riscv/riscv-iommu-sys.c b/hw/riscv/riscv-iommu-sys.c index 74e76b9..e34d00a 100644 --- a/hw/riscv/riscv-iommu-sys.c +++ b/hw/riscv/riscv-iommu-sys.c @@ -53,12 +53,6 @@ struct RISCVIOMMUStateSys { uint8_t *msix_pba; }; -struct RISCVIOMMUSysClass { - /*< public >*/ - DeviceRealize parent_realize; - ResettablePhases parent_phases; -}; - static uint64_t msix_table_mmio_read(void *opaque, hwaddr addr, unsigned size) { diff --git a/hw/s390x/ap-stub.c b/hw/s390x/ap-stub.c new file mode 100644 index 0000000..001fe5f --- /dev/null +++ b/hw/s390x/ap-stub.c @@ -0,0 +1,21 @@ +/* + * VFIO based AP matrix device assignment + * + * Copyright 2025 IBM Corp. + * Author(s): Rorie Reyes <rreyes@linux.ibm.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/s390x/ap-bridge.h" + +int ap_chsc_sei_nt0_get_event(void *res) +{ + return EVENT_INFORMATION_NOT_STORED; +} + +bool ap_chsc_sei_nt0_have_event(void) +{ + return false; +} diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build index 3bbebfd..99cbcbd 100644 --- a/hw/s390x/meson.build +++ b/hw/s390x/meson.build @@ -33,6 +33,7 @@ s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files( )) s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c')) s390x_ss.add(when: 'CONFIG_VFIO', if_true: files('s390-pci-vfio.c')) +s390x_ss.add(when: 'CONFIG_VFIO_AP', if_false: files('ap-stub.c')) virtio_ss = ss.source_set() virtio_ss.add(files('virtio-ccw.c')) diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index f69a4d8..ce3c13d 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -1145,20 +1145,6 @@ static void ccw_machine_4_2_class_options(MachineClass *mc) } DEFINE_CCW_MACHINE(4, 2); -static void ccw_machine_4_1_instance_options(MachineState *machine) -{ - static const S390FeatInit qemu_cpu_feat = { S390_FEAT_LIST_QEMU_V4_1 }; - ccw_machine_4_2_instance_options(machine); - s390_set_qemu_cpu_model(0x2964, 13, 2, qemu_cpu_feat); -} - -static void ccw_machine_4_1_class_options(MachineClass *mc) -{ - ccw_machine_4_2_class_options(mc); - compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len); -} -DEFINE_CCW_MACHINE(4, 1); - static void ccw_machine_register_types(void) { type_register_static(&ccw_machine_info); diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c index 0fd1337..cb48cc1 100644 --- a/hw/timer/hpet.c +++ b/hw/timer/hpet.c @@ -328,16 +328,16 @@ static const VMStateDescription vmstate_hpet_timer = { static const VMStateDescription vmstate_hpet = { .name = "hpet", .version_id = 2, - .minimum_version_id = 1, + .minimum_version_id = 2, .pre_save = hpet_pre_save, .post_load = hpet_post_load, .fields = (const VMStateField[]) { VMSTATE_UINT64(config, HPETState), VMSTATE_UINT64(isr, HPETState), VMSTATE_UINT64(hpet_counter, HPETState), - VMSTATE_UINT8_V(num_timers_save, HPETState, 2), + VMSTATE_UINT8(num_timers_save, HPETState), VMSTATE_VALIDATE("num_timers must match", hpet_validate_num_timers), - VMSTATE_STRUCT_VARRAY_UINT8(timer, HPETState, num_timers, 0, + VMSTATE_STRUCT_VARRAY_UINT8(timer, HPETState, num_timers_save, 0, vmstate_hpet_timer, HPETTimer), VMSTATE_END_OF_LIST() }, @@ -691,8 +691,14 @@ static void hpet_realize(DeviceState *dev, Error **errp) int i; HPETTimer *timer; + if (s->num_timers < HPET_MIN_TIMERS || s->num_timers > HPET_MAX_TIMERS) { + error_setg(errp, "hpet.num_timers must be between %d and %d", + HPET_MIN_TIMERS, HPET_MAX_TIMERS); + return; + } if (!s->intcap) { - warn_report("Hpet's intcap not initialized"); + error_setg(errp, "hpet.hpet-intcap not initialized"); + return; } if (hpet_fw_cfg.count == UINT8_MAX) { /* first instance */ @@ -700,7 +706,7 @@ static void hpet_realize(DeviceState *dev, Error **errp) } if (hpet_fw_cfg.count == 8) { - error_setg(errp, "Only 8 instances of HPET is allowed"); + error_setg(errp, "Only 8 instances of HPET are allowed"); return; } @@ -710,11 +716,6 @@ static void hpet_realize(DeviceState *dev, Error **errp) sysbus_init_irq(sbd, &s->irqs[i]); } - if (s->num_timers < HPET_MIN_TIMERS) { - s->num_timers = HPET_MIN_TIMERS; - } else if (s->num_timers > HPET_MAX_TIMERS) { - s->num_timers = HPET_MAX_TIMERS; - } for (i = 0; i < HPET_MAX_TIMERS; i++) { timer = &s->timer[i]; timer->qemu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, hpet_timer, timer); diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 785c0a0..874e0d1 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -10,6 +10,7 @@ * directory. */ +#include <stdbool.h> #include "qemu/osdep.h" #include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include <linux/vfio.h> @@ -18,8 +19,10 @@ #include "hw/vfio/vfio-device.h" #include "system/iommufd.h" #include "hw/s390x/ap-device.h" +#include "hw/s390x/css.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" +#include "qemu/lockable.h" #include "qemu/main-loop.h" #include "qemu/module.h" #include "qemu/option.h" @@ -37,8 +40,18 @@ struct VFIOAPDevice { APDevice apdev; VFIODevice vdev; EventNotifier req_notifier; + EventNotifier cfg_notifier; }; +typedef struct APConfigChgEvent { + QTAILQ_ENTRY(APConfigChgEvent) next; +} APConfigChgEvent; + +static QTAILQ_HEAD(, APConfigChgEvent) cfg_chg_events = + QTAILQ_HEAD_INITIALIZER(cfg_chg_events); + +static QemuMutex cfg_chg_events_lock; + OBJECT_DECLARE_SIMPLE_TYPE(VFIOAPDevice, VFIO_AP_DEVICE) static void vfio_ap_compute_needs_reset(VFIODevice *vdev) @@ -70,6 +83,57 @@ static void vfio_ap_req_notifier_handler(void *opaque) } } +static void vfio_ap_cfg_chg_notifier_handler(void *opaque) +{ + APConfigChgEvent *cfg_chg_event; + VFIOAPDevice *vapdev = opaque; + + if (!event_notifier_test_and_clear(&vapdev->cfg_notifier)) { + return; + } + + cfg_chg_event = g_new0(APConfigChgEvent, 1); + + WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) { + QTAILQ_INSERT_TAIL(&cfg_chg_events, cfg_chg_event, next); + } + + css_generate_css_crws(0); + +} + +int ap_chsc_sei_nt0_get_event(void *res) +{ + ChscSeiNt0Res *nt0_res = (ChscSeiNt0Res *)res; + APConfigChgEvent *cfg_chg_event; + + WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) { + if (QTAILQ_EMPTY(&cfg_chg_events)) { + return EVENT_INFORMATION_NOT_STORED; + } + + cfg_chg_event = QTAILQ_FIRST(&cfg_chg_events); + QTAILQ_REMOVE(&cfg_chg_events, cfg_chg_event, next); + } + + memset(nt0_res, 0, sizeof(*nt0_res)); + g_free(cfg_chg_event); + nt0_res->flags |= PENDING_EVENT_INFO_BITMASK; + nt0_res->length = sizeof(ChscSeiNt0Res); + nt0_res->code = NT0_RES_RESPONSE_CODE; + nt0_res->nt = NT0_RES_NT_DEFAULT; + nt0_res->rs = NT0_RES_RS_AP_CHANGE; + nt0_res->cc = NT0_RES_CC_AP_CHANGE; + + return EVENT_INFORMATION_STORED; +} + +bool ap_chsc_sei_nt0_have_event(void) +{ + QEMU_LOCK_GUARD(&cfg_chg_events_lock); + return !QTAILQ_EMPTY(&cfg_chg_events); +} + static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, unsigned int irq, Error **errp) { @@ -85,6 +149,10 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, notifier = &vapdev->req_notifier; fd_read = vfio_ap_req_notifier_handler; break; + case VFIO_AP_CFG_CHG_IRQ_INDEX: + notifier = &vapdev->cfg_notifier; + fd_read = vfio_ap_cfg_chg_notifier_handler; + break; default: error_setg(errp, "vfio: Unsupported device irq(%d)", irq); return false; @@ -137,6 +205,9 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev, case VFIO_AP_REQ_IRQ_INDEX: notifier = &vapdev->req_notifier; break; + case VFIO_AP_CFG_CHG_IRQ_INDEX: + notifier = &vapdev->cfg_notifier; + break; default: error_report("vfio: Unsupported device irq(%d)", irq); return; @@ -159,6 +230,13 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); VFIODevice *vbasedev = &vapdev->vdev; + static bool lock_initialized; + + if (!lock_initialized) { + qemu_mutex_init(&cfg_chg_events_lock); + lock_initialized = true; + } + if (!vfio_device_get_name(vbasedev, errp)) { return; } @@ -176,6 +254,15 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) warn_report_err(err); } + if (!vfio_ap_register_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX, &err)) + { + /* + * Report this error, but do not make it a failing condition. + * Lack of this IRQ in the host does not prevent normal operation. + */ + warn_report_err(err); + } + return; error: @@ -188,6 +275,7 @@ static void vfio_ap_unrealize(DeviceState *dev) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX); + vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX); vfio_device_detach(&vapdev->vdev); g_free(vapdev->vdev.name); } diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 1c6ca94..d834bd4 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -75,12 +75,12 @@ void vfio_address_space_insert(VFIOAddressSpace *space, int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly) + void *vaddr, bool readonly, MemoryRegion *mr) { VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); g_assert(vioc->dma_map); - return vioc->dma_map(bcontainer, iova, size, vaddr, readonly); + return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr); } int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, diff --git a/hw/vfio/container.c b/hw/vfio/container.c index a9f0dba..3e8d645 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -31,10 +31,11 @@ #include "system/reset.h" #include "trace.h" #include "qapi/error.h" +#include "migration/cpr.h" +#include "migration/blocker.h" #include "pci.h" #include "hw/vfio/vfio-container.h" #include "vfio-helpers.h" -#include "vfio-cpr.h" #include "vfio-listener.h" #define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" @@ -135,6 +136,8 @@ static int vfio_legacy_dma_unmap_one(const VFIOContainerBase *bcontainer, int ret; Error *local_err = NULL; + g_assert(!cpr_is_incoming()); + if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) { if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) && bcontainer->dirty_pages_supported) { @@ -207,7 +210,8 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, } static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { const VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); @@ -425,7 +429,12 @@ static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group, return NULL; } - if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { + /* + * During CPR, just set the container type and skip the ioctls, as the + * container and group are already configured in the kernel. + */ + if (!cpr_is_incoming() && + !vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { return NULL; } @@ -592,6 +601,11 @@ static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup *group, group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); vfio_group_add_kvm_device(group); + /* + * Remember the container fd for each group, so we can attach to the same + * container after CPR. + */ + cpr_resave_fd("vfio_container_for_group", group->groupid, container->fd); return true; } @@ -601,6 +615,7 @@ static void vfio_container_group_del(VFIOContainer *container, VFIOGroup *group) group->container = NULL; vfio_group_del_kvm_device(group); vfio_ram_block_discard_disable(container, false); + cpr_delete_fd("vfio_container_for_group", group->groupid); } static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, @@ -615,17 +630,34 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, bool group_was_added = false; space = vfio_address_space_get(as); + fd = cpr_find_fd("vfio_container_for_group", group->groupid); - QLIST_FOREACH(bcontainer, &space->containers, next) { - container = container_of(bcontainer, VFIOContainer, bcontainer); - if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { - return vfio_container_group_add(container, group, errp); + if (!cpr_is_incoming()) { + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + return vfio_container_group_add(container, group, errp); + } } - } - fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); - if (fd < 0) { - goto fail; + fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); + if (fd < 0) { + goto fail; + } + } else { + /* + * For incoming CPR, the group is already attached in the kernel. + * If a container with matching fd is found, then update the + * userland group list and return. If not, then after the loop, + * create the container struct and group list. + */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); + + if (vfio_cpr_container_match(container, group, fd)) { + return vfio_container_group_add(container, group, errp); + } + } } ret = ioctl(fd, VFIO_GET_API_VERSION); @@ -642,7 +674,7 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, new_container = true; bcontainer = &container->bcontainer; - if (!vfio_cpr_register_container(bcontainer, errp)) { + if (!vfio_legacy_cpr_register_container(container, errp)) { goto fail; } @@ -660,8 +692,17 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, } group_was_added = true; - if (!vfio_listener_register(bcontainer, errp)) { - goto fail; + /* + * If CPR, register the listener later, after all state that may + * affect regions and mapping boundaries has been cpr load'ed. Later, + * the listener will invoke its callback on each flat section and call + * dma_map to supply the new vaddr, and the calls will match the mappings + * remembered by the kernel. + */ + if (!cpr_is_incoming()) { + if (!vfio_listener_register(bcontainer, errp)) { + goto fail; + } } bcontainer->initialized = true; @@ -678,7 +719,7 @@ fail: vioc->release(bcontainer); } if (new_container) { - vfio_cpr_unregister_container(bcontainer); + vfio_legacy_cpr_unregister_container(container); object_unref(container); } if (fd >= 0) { @@ -697,6 +738,7 @@ static void vfio_container_disconnect(VFIOGroup *group) QLIST_REMOVE(group, container_next); group->container = NULL; + cpr_delete_fd("vfio_container_for_group", group->groupid); /* * Explicitly release the listener first before unset container, @@ -719,7 +761,7 @@ static void vfio_container_disconnect(VFIOGroup *group) VFIOAddressSpace *space = bcontainer->space; trace_vfio_container_disconnect(container->fd); - vfio_cpr_unregister_container(bcontainer); + vfio_legacy_cpr_unregister_container(container); close(container->fd); object_unref(container); @@ -750,7 +792,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp) group = g_malloc0(sizeof(*group)); snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); - group->fd = qemu_open(path, O_RDWR, errp); + group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, errp); if (group->fd < 0) { goto free_group_exit; } @@ -782,6 +824,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp) return group; close_fd_exit: + cpr_delete_fd("vfio_group", groupid); close(group->fd); free_group_exit: @@ -803,6 +846,7 @@ static void vfio_group_put(VFIOGroup *group) vfio_container_disconnect(group); QLIST_REMOVE(group, next); trace_vfio_group_put(group->fd); + cpr_delete_fd("vfio_group", group->groupid); close(group->fd); g_free(group); } @@ -813,7 +857,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, g_autofree struct vfio_device_info *info = NULL; int fd; - fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); + fd = vfio_cpr_group_get_device_fd(group->fd, name); if (fd < 0) { error_setg_errno(errp, errno, "error getting device from group %d", group->groupid); @@ -826,8 +870,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, info = vfio_get_device_info(fd); if (!info) { error_setg_errno(errp, errno, "error getting device info"); - close(fd); - return false; + goto fail; } /* @@ -841,8 +884,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, if (!QLIST_EMPTY(&group->device_list)) { error_setg(errp, "Inconsistent setting of support for discarding " "RAM (e.g., balloon) within group"); - close(fd); - return false; + goto fail; } if (!group->ram_block_discard_allowed) { @@ -860,6 +902,11 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs); return true; + +fail: + close(fd); + cpr_delete_fd(name, 0); + return false; } static void vfio_device_put(VFIODevice *vbasedev) @@ -870,6 +917,7 @@ static void vfio_device_put(VFIODevice *vbasedev) QLIST_REMOVE(vbasedev, next); vbasedev->group = NULL; trace_vfio_device_put(vbasedev->fd); + cpr_delete_fd(vbasedev->name, 0); close(vbasedev->fd); } @@ -939,6 +987,13 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, goto device_put_exit; } + if (vbasedev->mdev) { + error_setg(&vbasedev->cpr.mdev_blocker, + "CPR does not support vfio mdev %s", vbasedev->name); + migrate_add_blocker_modes(&vbasedev->cpr.mdev_blocker, &error_fatal, + MIG_MODE_CPR_TRANSFER, -1); + } + return true; device_put_exit: @@ -956,6 +1011,7 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev) vfio_device_unprepare(vbasedev); + migrate_del_blocker(&vbasedev->cpr.mdev_blocker); object_unref(vbasedev->hiod); vfio_device_put(vbasedev); vfio_group_put(group); diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c new file mode 100644 index 0000000..a84c324 --- /dev/null +++ b/hw/vfio/cpr-legacy.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2021-2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include <sys/ioctl.h> +#include <linux/vfio.h> +#include "qemu/osdep.h" +#include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-listener.h" +#include "migration/blocker.h" +#include "migration/cpr.h" +#include "migration/migration.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "qemu/error-report.h" + +static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL, + .iova = 0, + .size = 0, + }; + if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all"); + return false; + } + container->cpr.vaddr_unmapped = true; + return true; +} + +/* + * Set the new @vaddr for any mappings registered during cpr load. + * The incoming state is cleared thereafter. + */ +static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, void *vaddr, + bool readonly, MemoryRegion *mr) +{ + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_VADDR, + .vaddr = (__u64)(uintptr_t)vaddr, + .iova = iova, + .size = size, + }; + + g_assert(cpr_is_incoming()); + + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) { + return -errno; + } + + return 0; +} + +static void vfio_region_remap(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + cpr.remap_listener); + vfio_container_region_add(&container->bcontainer, section, true); +} + +static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) +{ + if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) { + error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR"); + return false; + + } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) { + error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL"); + return false; + + } else { + return true; + } +} + +static int vfio_container_pre_save(void *opaque) +{ + VFIOContainer *container = opaque; + Error *local_err = NULL; + + if (!vfio_dma_unmap_vaddr_all(container, &local_err)) { + error_report_err(local_err); + return -1; + } + return 0; +} + +static int vfio_container_post_load(void *opaque, int version_id) +{ + VFIOContainer *container = opaque; + VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOGroup *group; + Error *local_err = NULL; + + if (!vfio_listener_register(bcontainer, &local_err)) { + error_report_err(local_err); + return -1; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + + /* Restore original dma_map function */ + vioc->dma_map = container->cpr.saved_dma_map; + } + return 0; +} + +static const VMStateDescription vfio_container_vmstate = { + .name = "vfio-container", + .version_id = 0, + .minimum_version_id = 0, + .priority = MIG_PRI_LOW, /* Must happen after devices and groups */ + .pre_save = vfio_container_pre_save, + .post_load = vfio_container_post_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + +static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) +{ + VFIOContainer *container = + container_of(notifier, VFIOContainer, cpr.transfer_notifier); + VFIOContainerBase *bcontainer = &container->bcontainer; + + if (e->type != MIG_EVENT_PRECOPY_FAILED) { + return 0; + } + + if (container->cpr.vaddr_unmapped) { + /* + * Force a call to vfio_region_remap for each mapped section by + * temporarily registering a listener, and temporarily diverting + * dma_map to vfio_legacy_cpr_dma_map. The latter restores vaddr. + */ + + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + vioc->dma_map = vfio_legacy_cpr_dma_map; + + container->cpr.remap_listener = (MemoryListener) { + .name = "vfio cpr recover", + .region_add = vfio_region_remap + }; + memory_listener_register(&container->cpr.remap_listener, + bcontainer->space->as); + memory_listener_unregister(&container->cpr.remap_listener); + container->cpr.vaddr_unmapped = false; + vioc->dma_map = container->cpr.saved_dma_map; + } + return 0; +} + +bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + Error **cpr_blocker = &container->cpr.blocker; + + migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, + vfio_cpr_reboot_notifier, + MIG_MODE_CPR_REBOOT); + + if (!vfio_cpr_supported(container, cpr_blocker)) { + return migrate_add_blocker_modes(cpr_blocker, errp, + MIG_MODE_CPR_TRANSFER, -1) == 0; + } + + vmstate_register(NULL, -1, &vfio_container_vmstate, container); + + /* During incoming CPR, divert calls to dma_map. */ + if (cpr_is_incoming()) { + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + container->cpr.saved_dma_map = vioc->dma_map; + vioc->dma_map = vfio_legacy_cpr_dma_map; + } + + migration_add_notifier_mode(&container->cpr.transfer_notifier, + vfio_cpr_fail_notifier, + MIG_MODE_CPR_TRANSFER); + return true; +} + +void vfio_legacy_cpr_unregister_container(VFIOContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + migration_remove_notifier(&bcontainer->cpr_reboot_notifier); + migrate_del_blocker(&container->cpr.blocker); + vmstate_unregister(NULL, &vfio_container_vmstate, container); + migration_remove_notifier(&container->cpr.transfer_notifier); +} + +/* + * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after + * succeeding for others, so the latter have lost their vaddr. Call this + * to restore vaddr for a section with a giommu. + * + * The giommu already exists. Find it and replay it, which calls + * vfio_legacy_cpr_dma_map further down the stack. + */ +void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + VFIOGuestIOMMU *giommu = NULL; + hwaddr as_offset = section->offset_within_address_space; + hwaddr iommu_offset = as_offset - section->offset_within_region; + + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (giommu->iommu_mr == IOMMU_MEMORY_REGION(section->mr) && + giommu->iommu_offset == iommu_offset) { + break; + } + } + g_assert(giommu); + memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); +} + +/* + * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after + * succeeding for others, so the latter have lost their vaddr. Call this + * to restore vaddr for a section with a RamDiscardManager. + * + * The ram discard listener already exists. Call its populate function + * directly, which calls vfio_legacy_cpr_dma_map. + */ +bool vfio_cpr_ram_discard_register_listener(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); + + g_assert(vrdl); + return vrdl->listener.notify_populate(&vrdl->listener, section) == 0; +} + +int vfio_cpr_group_get_device_fd(int d, const char *name) +{ + const int id = 0; + int fd = cpr_find_fd(name, id); + + if (fd < 0) { + fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name); + if (fd >= 0) { + cpr_save_fd(name, id, fd); + } + } + return fd; +} + +static bool same_device(int fd1, int fd2) +{ + struct stat st1, st2; + + return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev; +} + +bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group, + int fd) +{ + if (container->fd == fd) { + return true; + } + if (!same_device(container->fd, fd)) { + return false; + } + /* + * Same device, different fd. This occurs when the container fd is + * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS + * produces duplicates. De-dup it. + */ + cpr_delete_fd("vfio_container_for_group", group->groupid); + close(fd); + cpr_save_fd("vfio_container_for_group", group->groupid, container->fd); + return true; +} diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index 3214184..fdbb58e 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -7,13 +7,14 @@ #include "qemu/osdep.h" #include "hw/vfio/vfio-device.h" -#include "migration/misc.h" +#include "hw/vfio/vfio-cpr.h" +#include "hw/vfio/pci.h" +#include "migration/cpr.h" #include "qapi/error.h" #include "system/runstate.h" -#include "vfio-cpr.h" -static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, - MigrationEvent *e, Error **errp) +int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) { if (e->type == MIG_EVENT_PRECOPY_SETUP && !runstate_check(RUN_STATE_SUSPENDED) && !vm_get_suspended()) { @@ -38,3 +39,32 @@ void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer) { migration_remove_notifier(&bcontainer->cpr_reboot_notifier); } + +/* + * The kernel may change non-emulated config bits. Exclude them from the + * changed-bits check in get_pci_config_device. + */ +static int vfio_cpr_pci_pre_load(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = &vdev->pdev; + int size = MIN(pci_config_size(pdev), vdev->config_size); + int i; + + for (i = 0; i < size; i++) { + pdev->cmask[i] &= vdev->emulated_config_bits[i]; + } + + return 0; +} + +const VMStateDescription vfio_cpr_pci_vmstate = { + .name = "vfio-cpr-pci", + .version_id = 0, + .minimum_version_id = 0, + .pre_load = vfio_cpr_pci_pre_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 9fba2c7..d32600e 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -200,6 +200,7 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info) { size_t argsz = sizeof(struct vfio_region_info); + int fd = -1; int ret; /* check cache */ @@ -214,7 +215,7 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index, retry: (*info)->argsz = argsz; - ret = vbasedev->io_ops->get_region_info(vbasedev, *info); + ret = vbasedev->io_ops->get_region_info(vbasedev, *info, &fd); if (ret != 0) { g_free(*info); *info = NULL; @@ -225,11 +226,19 @@ retry: argsz = (*info)->argsz; *info = g_realloc(*info, argsz); + if (fd != -1) { + close(fd); + fd = -1; + } + goto retry; } /* fill cache */ vbasedev->reginfo[index] = *info; + if (vbasedev->region_fds != NULL) { + vbasedev->region_fds[index] = fd; + } return 0; } @@ -334,6 +343,7 @@ void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, vbasedev->io_ops = &vfio_device_io_ops_ioctl; vbasedev->dev = dev; vbasedev->fd = -1; + vbasedev->use_region_fds = false; vbasedev->ram_block_discard_allowed = ram_discard; } @@ -444,6 +454,9 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, vbasedev->reginfo = g_new0(struct vfio_region_info *, vbasedev->num_regions); + if (vbasedev->use_region_fds) { + vbasedev->region_fds = g_new0(int, vbasedev->num_regions); + } } void vfio_device_unprepare(VFIODevice *vbasedev) @@ -452,9 +465,14 @@ void vfio_device_unprepare(VFIODevice *vbasedev) for (i = 0; i < vbasedev->num_regions; i++) { g_free(vbasedev->reginfo[i]); + if (vbasedev->region_fds != NULL && vbasedev->region_fds[i] != -1) { + close(vbasedev->region_fds[i]); + } + } - g_free(vbasedev->reginfo); - vbasedev->reginfo = NULL; + + g_clear_pointer(&vbasedev->reginfo, g_free); + g_clear_pointer(&vbasedev->region_fds, g_free); QLIST_REMOVE(vbasedev, container_next); QLIST_REMOVE(vbasedev, global_next); @@ -476,10 +494,13 @@ static int vfio_device_io_device_feature(VFIODevice *vbasedev, } static int vfio_device_io_get_region_info(VFIODevice *vbasedev, - struct vfio_region_info *info) + struct vfio_region_info *info, + int *fd) { int ret; + *fd = -1; + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); return ret < 0 ? -errno : ret; @@ -522,7 +543,8 @@ static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index, } static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index, - off_t off, uint32_t size, void *data) + off_t off, uint32_t size, void *data, + bool post) { struct vfio_region_info *info; int ret; diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index e7952d1..e7a9d1f 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -187,23 +187,21 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, } static bool vfio_pci_igd_opregion_detect(VFIOPCIDevice *vdev, - struct vfio_region_info **opregion, - Error **errp) + struct vfio_region_info **opregion) { int ret; - /* Hotplugging is not supported for opregion access */ - if (vdev->pdev.qdev.hotplugged) { - error_setg(errp, "IGD OpRegion is not supported on hotplugged device"); - return false; - } - ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, opregion); if (ret) { - error_setg_errno(errp, -ret, - "Device does not supports IGD OpRegion feature"); + return false; + } + + /* Hotplugging is not supported for opregion access */ + if (vdev->pdev.qdev.hotplugged) { + warn_report("IGD device detected, but OpRegion is not supported " + "on hotplugged device."); return false; } @@ -524,7 +522,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) } /* IGD device always comes with OpRegion */ - if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) { + if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) { return true; } info_report("OpRegion detected on Intel display %x.", vdev->device_id); @@ -695,7 +693,7 @@ static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp) return true; } - if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) { + if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) { /* Should never reach here, KVMGT always emulates OpRegion */ return false; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index af1c7ab..d3efef7 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -21,20 +21,21 @@ #include "qapi/error.h" #include "system/iommufd.h" #include "hw/qdev-core.h" +#include "hw/vfio/vfio-cpr.h" #include "system/reset.h" #include "qemu/cutils.h" #include "qemu/chardev_open.h" #include "pci.h" #include "vfio-iommufd.h" #include "vfio-helpers.h" -#include "vfio-cpr.h" #include "vfio-listener.h" #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); @@ -592,6 +593,10 @@ found_container: goto err_listener_register; } + /* + * Do not move this code before attachment! The nested IOMMU support + * needs device and hwpt id which are generated only after attachment. + */ if (!vfio_device_hiod_create_and_realize(vbasedev, TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, errp)) { goto err_listener_register; @@ -810,21 +815,38 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, const void *data) vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; +static bool +host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); +} + +static bool +host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); +} + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, Error **errp) { VFIODevice *vdev = opaque; + HostIOMMUDeviceIOMMUFD *idev; HostIOMMUDeviceCaps *caps = &hiod->caps; + VendorCaps *vendor_caps = &caps->vendor_caps; enum iommu_hw_info_type type; - union { - struct iommu_hw_info_vtd vtd; - } data; uint64_t hw_caps; hiod->agent = opaque; - if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, - &type, &data, sizeof(data), + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type, + vendor_caps, sizeof(*vendor_caps), &hw_caps, errp)) { return false; } @@ -833,6 +855,11 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, caps->type = type; caps->hw_caps = hw_caps; + idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + idev->iommufd = vdev->iommufd; + idev->devid = vdev->devid; + idev->hwpt_id = vdev->hwpt->hwpt_id; + return true; } @@ -858,10 +885,14 @@ hiod_iommufd_vfio_get_page_size_mask(HostIOMMUDevice *hiod) static void hiod_iommufd_vfio_class_init(ObjectClass *oc, const void *data) { HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); hiodc->realize = hiod_iommufd_vfio_realize; hiodc->get_iova_ranges = hiod_iommufd_vfio_get_iova_ranges; hiodc->get_page_size_mask = hiod_iommufd_vfio_get_page_size_mask; + + idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; + idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; }; static const TypeInfo types[] = { diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index bfacb3d..f498e23 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -90,16 +90,17 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } -/* Called with rcu_read_lock held. */ -static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - Error **errp) +/* + * Called with rcu_read_lock held. + * The returned MemoryRegion must not be accessed after calling rcu_read_unlock. + */ +static MemoryRegion *vfio_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp) { - bool ret, mr_has_discard_manager; + MemoryRegion *mr; - ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, - &mr_has_discard_manager, errp); - if (ret && mr_has_discard_manager) { + mr = memory_translate_iotlb(iotlb, xlat_p, errp); + if (mr && memory_region_has_ram_discard_manager(mr)) { /* * Malicious VMs might trigger discarding of IOMMU-mapped memory. The * pages will remain pinned inside vfio until unmapped, resulting in a @@ -118,7 +119,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, " intended via an IOMMU. It's possible to mitigate " " by setting/adjusting RLIMIT_MEMLOCK."); } - return ret; + return mr; } static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) @@ -126,6 +127,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; + MemoryRegion *mr; + hwaddr xlat; void *vaddr; int ret; Error *local_err = NULL; @@ -150,10 +153,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { error_report_err(local_err); goto out; } + vaddr = memory_region_get_ram_ptr(mr) + xlat; + read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly; + /* * vaddr is only valid until rcu_read_unlock(). But after * vfio_dma_map has set up the mapping the pages will be @@ -163,7 +170,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) */ ret = vfio_container_dma_map(bcontainer, iova, iotlb->addr_mask + 1, vaddr, - read_only); + read_only, mr); if (ret) { error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -233,7 +240,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, vaddr = memory_region_get_ram_ptr(section->mr) + start; ret = vfio_container_dma_map(bcontainer, iova, next - start, - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -430,7 +437,7 @@ static void vfio_listener_commit(MemoryListener *listener) listener); void (*listener_commit)(VFIOContainerBase *bcontainer); - listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin; + listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit; if (listener_commit) { listener_commit(bcontainer); @@ -449,11 +456,38 @@ static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) } } +VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainerBase *bcontainer, MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + /* does not return */ + } + return vrdl; +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, listener); + vfio_container_region_add(bcontainer, section, false); +} + +void vfio_container_region_add(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + bool cpr_remap) +{ hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -489,6 +523,11 @@ static void vfio_listener_region_add(MemoryListener *listener, int iommu_idx; trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); + + if (cpr_remap) { + vfio_cpr_giommu_remap(bcontainer, section); + } + /* * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this @@ -531,7 +570,12 @@ static void vfio_listener_region_add(MemoryListener *listener, * about changes. */ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_ram_discard_register_listener(bcontainer, section); + if (!cpr_remap) { + vfio_ram_discard_register_listener(bcontainer, section); + } else if (!vfio_cpr_ram_discard_register_listener(bcontainer, + section)) { + goto fail; + } return; } @@ -557,7 +601,7 @@ static void vfio_listener_region_add(MemoryListener *listener, } ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -1010,6 +1054,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) ram_addr_t translated_addr; Error *local_err = NULL; int ret = -EINVAL; + MemoryRegion *mr; + hwaddr xlat; trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); @@ -1021,9 +1067,11 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) } rcu_read_lock(); - if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { goto out_unlock; } + translated_addr = memory_region_get_ram_addr(mr) + xlat; ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, translated_addr, &local_err); @@ -1075,19 +1123,8 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - VFIORamDiscardListener *vrdl = NULL; - - QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { - if (vrdl->mr == section->mr && - vrdl->offset_within_address_space == - section->offset_within_address_space) { - break; - } - } - - if (!vrdl) { - hw_error("vfio: Trying to sync missing RAM discard listener"); - } + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); /* * We only want/can synchronize the bitmap for actually mapped parts - diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index bccb050..73d29f9 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -21,6 +21,7 @@ system_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) system_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c')) system_ss.add(when: 'CONFIG_VFIO', if_true: files( 'cpr.c', + 'cpr-legacy.c', 'device.c', 'migration.c', 'migration-multifd.c', diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index a1bfdfe..fa25bde 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -30,6 +30,7 @@ #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" #include "migration/vmstate.h" +#include "migration/cpr.h" #include "qobject/qdict.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -56,6 +57,23 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static void vfio_msi_disable_common(VFIOPCIDevice *vdev); +static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr, Error **errp) +{ + int ret = event_notifier_init(e, 0); + + if (ret) { + error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name); + } + return !ret; +} + +static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr) +{ + event_notifier_cleanup(e); +} + /* * Disabling BAR mmaping can be slow, but toggling it around INTx can * also be a huge overhead. We try to get the best of both worlds by @@ -103,7 +121,7 @@ static void vfio_intx_interrupt(void *opaque) } } -static void vfio_intx_eoi(VFIODevice *vbasedev) +void vfio_pci_intx_eoi(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); @@ -111,7 +129,7 @@ static void vfio_intx_eoi(VFIODevice *vbasedev) return; } - trace_vfio_intx_eoi(vbasedev->name); + trace_vfio_pci_intx_eoi(vbasedev->name); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); @@ -136,8 +154,7 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) pci_irq_deassert(&vdev->pdev); /* Get an eventfd for resample/unmask */ - if (event_notifier_init(&vdev->intx.unmask, 0)) { - error_setg(errp, "event_notifier_init failed eoi"); + if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) { goto fail; } @@ -169,7 +186,7 @@ fail_vfio: kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt, vdev->intx.route.irq); fail_irqfd: - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); fail: qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev); vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); @@ -201,7 +218,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) } /* We only need to close the eventfd for VFIO to cleanup the kernel side */ - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); /* QEMU starts listening for interrupt events. */ qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt), @@ -236,7 +253,7 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route) } /* Re-enable the interrupt in cased we missed an EOI */ - vfio_intx_eoi(&vdev->vbasedev); + vfio_pci_intx_eoi(&vdev->vbasedev); } static void vfio_intx_routing_notifier(PCIDevice *pdev) @@ -268,7 +285,6 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); Error *err = NULL; int32_t fd; - int ret; if (!pin) { @@ -291,9 +307,8 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) } #endif - ret = event_notifier_init(&vdev->intx.interrupt, 0); - if (ret) { - error_setg_errno(errp, -ret, "event_notifier_init failed"); + if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0, + errp)) { return false; } fd = event_notifier_get_fd(&vdev->intx.interrupt); @@ -302,7 +317,7 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); return false; } @@ -329,13 +344,18 @@ static void vfio_intx_disable(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); vdev->interrupt = VFIO_INT_NONE; trace_vfio_intx_disable(vdev->vbasedev.name); } +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp) +{ + return vfio_intx_enable(vdev, errp); +} + /* * MSI/X */ @@ -460,8 +480,8 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) return ret; } -static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, - int vector_n, bool msix) +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix) { if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) { return; @@ -471,13 +491,16 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, vector_n, &vdev->pdev); } -static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr) { + const char *name = "kvm_interrupt"; + if (vector->virq < 0) { return; } - if (event_notifier_init(&vector->kvm_interrupt, 0)) { + if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr, + NULL)) { goto fail_notifier; } @@ -489,19 +512,20 @@ static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) return; fail_kvm: - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr); fail_notifier: kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; } -static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int nr) { kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr); } static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, @@ -511,6 +535,43 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, kvm_irqchip_commit_routes(kvm_state); } +static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector, + unsigned int nr) +{ + Error *err = NULL; + int32_t fd; + + if (vector->virq >= 0) { + fd = event_notifier_get_fd(&vector->kvm_interrupt); + } else { + fd = event_notifier_get_fd(&vector->interrupt); + } + + if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, + VFIO_IRQ_SET_ACTION_TRIGGER, + fd, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); + } +} + +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + PCIDevice *pdev = &vdev->pdev; + Error *local_err = NULL; + + vector->vdev = vdev; + vector->virq = -1; + if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr, + &local_err)) { + error_report_err(local_err); + } + vector->use = true; + if (vdev->interrupt == VFIO_INT_MSIX) { + msix_vector_use(pdev, nr); + } +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { @@ -524,13 +585,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, vector = &vdev->msi_vectors[nr]; if (!vector->use) { - vector->vdev = vdev; - vector->virq = -1; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); - } - vector->use = true; - msix_vector_use(pdev, nr); + vfio_pci_vector_init(vdev, nr); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -542,19 +597,19 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, */ if (vector->virq >= 0) { if (!msg) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, nr); } else { vfio_update_kvm_msi_virq(vector, *msg, pdev); } } else { if (msg) { if (vdev->defer_kvm_irq_routing) { - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); } else { vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); kvm_irqchip_commit_route_changes(&vfio_route_change); - vfio_connect_kvm_msi_virq(vector); + vfio_connect_kvm_msi_virq(vector, nr); } } } @@ -583,21 +638,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, strerror(-ret)); } } else { - Error *err = NULL; - int32_t fd; - - if (vector->virq >= 0) { - fd = event_notifier_get_fd(&vector->kvm_interrupt); - } else { - fd = event_notifier_get_fd(&vector->interrupt); - } - - if (!vfio_device_irq_set_signaling(&vdev->vbasedev, - VFIO_PCI_MSIX_IRQ_INDEX, nr, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, - &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); - } + set_irq_signalling(&vdev->vbasedev, vector, nr); } } @@ -645,14 +686,14 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) } } -static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { assert(!vdev->defer_kvm_irq_routing); vdev->defer_kvm_irq_routing = true; vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); } -static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { int i; @@ -662,7 +703,7 @@ static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) kvm_irqchip_commit_route_changes(&vfio_route_change); for (i = 0; i < vdev->nr_vectors; i++) { - vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i]); + vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i], i); } } @@ -682,14 +723,14 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) * routes once rather than per vector provides a substantial * performance improvement. */ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, vfio_msix_vector_release, NULL)) { error_report("vfio: msix_set_vector_notifiers failed"); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); if (vdev->nr_vectors) { ret = vfio_enable_vectors(vdev, true); @@ -733,19 +774,21 @@ retry: * Deferring to commit the KVM routes once rather than per vector * provides a substantial performance improvement. */ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); for (i = 0; i < vdev->nr_vectors; i++) { VFIOMSIVector *vector = &vdev->msi_vectors[i]; + Error *local_err = NULL; vector->vdev = vdev; vector->virq = -1; vector->use = true; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); + if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i, + &local_err)) { + error_report_err(local_err); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -755,10 +798,10 @@ retry: * Attempt to enable route through KVM irqchip, * default to userspace handling if unavailable. */ - vfio_add_kvm_msi_virq(vdev, vector, i, false); + vfio_pci_add_kvm_msi_virq(vdev, vector, i, false); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); /* Set interrupt type prior to possible interrupts */ vdev->interrupt = VFIO_INT_MSI; @@ -801,11 +844,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) VFIOMSIVector *vector = &vdev->msi_vectors[i]; if (vdev->msi_vectors[i].use) { if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, i); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); + vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i); } } @@ -984,7 +1027,7 @@ static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset, { return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, - offset, size, data); + offset, size, data, false); } static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) @@ -1738,7 +1781,7 @@ static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) return true; } -static void vfio_teardown_msi(VFIOPCIDevice *vdev) +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev) { msi_uninit(&vdev->pdev); @@ -1788,6 +1831,9 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK); bar->size = bar->region.size; + + /* IO regions are sync, memory can be async */ + bar->region.post_wr = (bar->ioport == 0); } static void vfio_bars_prepare(VFIOPCIDevice *vdev) @@ -1834,7 +1880,7 @@ static void vfio_bars_register(VFIOPCIDevice *vdev) } } -static void vfio_bars_exit(VFIOPCIDevice *vdev) +void vfio_pci_bars_exit(VFIOPCIDevice *vdev) { int i; @@ -2425,7 +2471,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) g_free(config); } -static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; @@ -2701,7 +2747,7 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, - .vfio_eoi = vfio_intx_eoi, + .vfio_eoi = vfio_pci_intx_eoi, .vfio_get_object = vfio_pci_get_object, .vfio_save_config = vfio_pci_save_config, .vfio_load_config = vfio_pci_load_config, @@ -2772,7 +2818,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) return true; } -static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; struct vfio_region_info *reg_info = NULL; @@ -2818,7 +2864,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) return false; } - trace_vfio_populate_device_config(vdev->vbasedev.name, + trace_vfio_pci_populate_device_config(vdev->vbasedev.name, (unsigned long)reg_info->size, (unsigned long)reg_info->offset, (unsigned long)reg_info->flags); @@ -2840,7 +2886,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ - trace_vfio_populate_device_get_irq_info_failure(strerror(-ret)); + trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret)); } else if (irq_info.count == 1) { vdev->pci_aer = true; } else { @@ -2852,8 +2898,20 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_pci_put_device(VFIOPCIDevice *vdev) +void vfio_pci_put_device(VFIOPCIDevice *vdev) { + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* + * XXX Leaking igd_opregion is not an oversight, we can't remove the + * fw_cfg entry therefore leaking this allocation seems like the safest + * option. + * + * g_free(vdev->igd_opregion); + */ + vfio_device_detach(&vdev->vbasedev); g_free(vdev->vbasedev.name); @@ -2888,7 +2946,7 @@ static void vfio_err_notifier_handler(void *opaque) * and continue after disabling error recovery support for the * device. */ -static void vfio_register_err_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev) { Error *err = NULL; int32_t fd; @@ -2897,8 +2955,9 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->err_notifier, 0)) { - error_report("vfio: Unable to init event notifier for error detection"); + if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0, + &err)) { + error_report_err(err); vdev->pci_aer = false; return; } @@ -2910,7 +2969,7 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); vdev->pci_aer = false; } } @@ -2929,7 +2988,7 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); } static void vfio_req_notifier_handler(void *opaque) @@ -2947,7 +3006,7 @@ static void vfio_req_notifier_handler(void *opaque) } } -static void vfio_register_req_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev) { struct vfio_irq_info irq_info; Error *err = NULL; @@ -2964,8 +3023,9 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->req_notifier, 0)) { - error_report("vfio: Unable to init event notifier for device request"); + if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0, + &err)) { + error_report_err(err); return; } @@ -2976,7 +3036,7 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); } else { vdev->req_enabled = true; } @@ -2996,15 +3056,28 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); vdev->req_enabled = false; } -static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; VFIODevice *vbasedev = &vdev->vbasedev; + uint32_t config_space_size; + int ret; + + config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); + + /* Get a copy of config space */ + ret = vfio_pci_config_space_read(vdev, 0, config_space_size, + vdev->pdev.config); + if (ret < (int)config_space_size) { + ret = ret < 0 ? -ret : EFAULT; + error_setg_errno(errp, ret, "failed to read device config space"); + return false; + } /* vfio emulates a lot for us, but some bits need extra love */ vdev->emulated_config_bits = g_malloc0(vdev->config_size); @@ -3094,7 +3167,7 @@ static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) return true; } -static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; @@ -3126,15 +3199,14 @@ static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_realize(PCIDevice *pdev, Error **errp) +static void vfio_pci_realize(PCIDevice *pdev, Error **errp) { ERRP_GUARD(); VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; - int i, ret; + int i; char uuid[UUID_STR_LEN]; g_autofree char *name = NULL; - uint32_t config_space_size; if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || @@ -3185,18 +3257,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } - if (!vfio_populate_device(vdev, errp)) { - goto error; - } - - config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); - - /* Get a copy of config space */ - ret = vfio_pci_config_space_read(vdev, 0, config_space_size, - vdev->pdev.config); - if (ret < (int)config_space_size) { - ret = ret < 0 ? -ret : EFAULT; - error_setg_errno(errp, ret, "failed to read device config space"); + if (!vfio_pci_populate_device(vdev, errp)) { goto error; } @@ -3210,7 +3271,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto out_teardown; } - if (!vfio_add_capabilities(vdev, errp)) { + if (!vfio_pci_add_capabilities(vdev, errp)) { goto out_unset_idev; } @@ -3226,7 +3287,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bar_quirk_setup(vdev, i); } - if (!vfio_interrupt_setup(vdev, errp)) { + if (!vfio_pci_interrupt_setup(vdev, errp)) { goto out_unset_idev; } @@ -3270,8 +3331,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } - vfio_register_err_notifier(vdev); - vfio_register_req_notifier(vdev); + vfio_pci_register_err_notifier(vdev); + vfio_pci_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); return; @@ -3292,8 +3353,8 @@ out_unset_idev: pci_device_unset_iommu_device(pdev); } out_teardown: - vfio_teardown_msi(vdev); - vfio_bars_exit(vdev); + vfio_pci_teardown_msi(vdev); + vfio_pci_bars_exit(vdev); error: error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); } @@ -3302,17 +3363,6 @@ static void vfio_instance_finalize(Object *obj) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); - vfio_display_finalize(vdev); - vfio_bars_finalize(vdev); - g_free(vdev->emulated_config_bits); - g_free(vdev->rom); - /* - * XXX Leaking igd_opregion is not an oversight, we can't remove the - * fw_cfg entry therefore leaking this allocation seems like the safest - * option. - * - * g_free(vdev->igd_opregion); - */ vfio_pci_put_device(vdev); } @@ -3331,9 +3381,9 @@ static void vfio_exitfn(PCIDevice *pdev) if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } - vfio_teardown_msi(vdev); + vfio_pci_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); - vfio_bars_exit(vdev); + vfio_pci_bars_exit(vdev); vfio_migration_exit(vbasedev); if (!vbasedev->mdev) { pci_device_unset_iommu_device(pdev); @@ -3344,6 +3394,11 @@ static void vfio_pci_reset(DeviceState *dev) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev); + /* Do not reset the device during qemu_system_reset prior to cpr load */ + if (cpr_is_incoming()) { + return; + } + trace_vfio_pci_reset(vdev->vbasedev.name); vfio_pci_pre_reset(vdev); @@ -3401,6 +3456,13 @@ static void vfio_instance_init(Object *obj) /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command * line, therefore, no need to wait to realize like other devices */ pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; + + /* + * A device that is resuming for cpr is already configured, so do not + * reset it during qemu_system_reset prior to cpr load, else interrupts + * may be lost. + */ + pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR; } static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) @@ -3418,7 +3480,7 @@ static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) static const TypeInfo vfio_pci_base_dev_info = { .name = TYPE_VFIO_PCI_BASE, .parent = TYPE_PCI_DEVICE, - .instance_size = 0, + .instance_size = sizeof(VFIOPCIDevice), .abstract = true, .class_init = vfio_pci_base_dev_class_init, .interfaces = (const InterfaceInfo[]) { @@ -3513,8 +3575,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) #ifdef CONFIG_IOMMUFD object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif + dc->vmsd = &vfio_cpr_pci_vmstate; dc->desc = "VFIO-based PCI device assignment"; - pdc->realize = vfio_realize; + pdc->realize = vfio_pci_realize; object_class_property_set_description(klass, /* 1.3 */ "host", @@ -3640,7 +3703,6 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) static const TypeInfo vfio_pci_dev_info = { .name = TYPE_VFIO_PCI, .parent = TYPE_VFIO_PCI_BASE, - .instance_size = sizeof(VFIOPCIDevice), .class_init = vfio_pci_dev_class_init, .instance_init = vfio_instance_init, .instance_finalize = vfio_instance_finalize, diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 5ce0fb9..d3dc227 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -210,6 +210,14 @@ static inline bool vfio_is_vga(VFIOPCIDevice *vdev) return class == PCI_CLASS_DISPLAY_VGA; } +/* MSI/MSI-X/INTx */ +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr); +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix); +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev); +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev); +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp); + uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len); @@ -248,4 +256,15 @@ void vfio_display_finalize(VFIOPCIDevice *vdev); extern const VMStateDescription vfio_display_vmstate; +void vfio_pci_bars_exit(VFIOPCIDevice *vdev); +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp); +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp); +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_intx_eoi(VFIODevice *vbasedev); +void vfio_pci_put_device(VFIOPCIDevice *vdev); +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev); +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev); +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev); + #endif /* HW_VFIO_VFIO_PCI_H */ diff --git a/hw/vfio/region.c b/hw/vfio/region.c index 34752c3..f5b8e3c 100644 --- a/hw/vfio/region.c +++ b/hw/vfio/region.c @@ -66,7 +66,7 @@ void vfio_region_write(void *opaque, hwaddr addr, } ret = vbasedev->io_ops->region_write(vbasedev, region->nr, - addr, size, &buf); + addr, size, &buf, region->post_wr); if (ret != size) { error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 ",%d) failed: %s", @@ -200,6 +200,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, region->size = info->size; region->fd_offset = info->offset; region->nr = index; + region->post_wr = false; if (region->size) { region->mem = g_new0(MemoryRegion, 1); @@ -241,6 +242,7 @@ int vfio_region_mmap(VFIORegion *region) { int i, ret, prot = 0; char *name; + int fd; if (!region->mem) { return 0; @@ -271,14 +273,18 @@ int vfio_region_mmap(VFIORegion *region) goto no_mmap; } + /* Use the per-region fd if set, or the shared fd. */ + fd = region->vbasedev->region_fds ? + region->vbasedev->region_fds[region->nr] : + region->vbasedev->fd, + map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align); munmap(map_base, map_align - map_base); munmap(map_align + region->mmaps[i].size, align - (map_align - map_base)); region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot, - MAP_SHARED | MAP_FIXED, - region->vbasedev->fd, + MAP_SHARED | MAP_FIXED, fd, region->fd_offset + region->mmaps[i].offset); if (region->mmaps[i].mmap == MAP_FAILED) { diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index e90ec9b..f06236f 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -2,7 +2,7 @@ # pci.c vfio_intx_interrupt(const char *name, char line) " (%s) Pin %c" -vfio_intx_eoi(const char *name) " (%s) EOI" +vfio_pci_intx_eoi(const char *name) " (%s) EOI" vfio_intx_enable_kvm(const char *name) " (%s) KVM INTx accel enabled" vfio_intx_disable_kvm(const char *name) " (%s) KVM INTx accel disabled" vfio_intx_update(const char *name, int new_irq, int target_irq) " (%s) IRQ moved %d -> %d" @@ -35,8 +35,8 @@ vfio_pci_hot_reset(const char *name, const char *type) " (%s) %s" vfio_pci_hot_reset_has_dep_devices(const char *name) "%s: hot reset dependent devices:" vfio_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int group_id) "\t%04x:%02x:%02x.%x group %d" vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %s" -vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx" -vfio_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" +vfio_pci_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx" +vfio_pci_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d" vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x" vfio_pci_reset(const char *name) " (%s)" diff --git a/hw/vfio/vfio-cpr.h b/hw/vfio/vfio-cpr.h deleted file mode 100644 index 134b83a..0000000 --- a/hw/vfio/vfio-cpr.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * VFIO CPR - * - * Copyright (c) 2025 Oracle and/or its affiliates. - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ - -#ifndef HW_VFIO_CPR_H -#define HW_VFIO_CPR_H - -bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp); -void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer); - -#endif /* HW_VFIO_CPR_H */ diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index e20da95..7061b6e 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -209,6 +209,8 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) int ret; Int128 llend; Error *local_err = NULL; + MemoryRegion *mr; + hwaddr xlat; if (iotlb->target_as != &address_space_memory) { error_report("Wrong target AS \"%s\", only system memory is allowed", @@ -228,11 +230,14 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL, - &local_err)) { + mr = memory_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { error_report_err(local_err); return; } + vaddr = memory_region_get_ram_ptr(mr) + xlat; + read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly; + ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova, iotlb->addr_mask + 1, vaddr, read_only); if (ret) { |