aboutsummaryrefslogtreecommitdiff
path: root/hw
diff options
context:
space:
mode:
Diffstat (limited to 'hw')
-rw-r--r--hw/char/sh_serial.c24
-rw-r--r--hw/display/apple-gfx.m10
-rw-r--r--hw/gpio/pca9552.c2
-rw-r--r--hw/hyperv/hv-balloon.c9
-rw-r--r--hw/intc/loongarch_pch_pic.c2
-rw-r--r--hw/loongarch/boot.c52
-rw-r--r--hw/loongarch/virt-acpi-build.c4
-rw-r--r--hw/loongarch/virt.c27
-rw-r--r--hw/misc/stm32_rcc.c2
-rw-r--r--hw/net/fsl_etsec/etsec.c1
-rw-r--r--hw/net/i82596.c38
-rw-r--r--hw/pci-host/ppce500.c8
-rw-r--r--hw/pci-host/raven.c87
-rw-r--r--hw/pci/msix.c2
-rw-r--r--hw/pci/pci.c5
-rw-r--r--hw/ppc/e500.c26
-rw-r--r--hw/ppc/e500.h4
-rw-r--r--hw/ppc/e500plat.c2
-rw-r--r--hw/ppc/mpc8544ds.c2
-rw-r--r--hw/ppc/prep.c27
-rw-r--r--hw/riscv/riscv-iommu-pci.c6
-rw-r--r--hw/riscv/riscv-iommu-sys.c6
-rw-r--r--hw/s390x/ap-stub.c21
-rw-r--r--hw/s390x/meson.build1
-rw-r--r--hw/s390x/s390-virtio-ccw.c14
-rw-r--r--hw/timer/hpet.c21
-rw-r--r--hw/vfio/ap.c88
-rw-r--r--hw/vfio/container-base.c4
-rw-r--r--hw/vfio/container.c100
-rw-r--r--hw/vfio/cpr-legacy.c287
-rw-r--r--hw/vfio/cpr.c38
-rw-r--r--hw/vfio/device.c32
-rw-r--r--hw/vfio/igd.c22
-rw-r--r--hw/vfio/iommufd.c45
-rw-r--r--hw/vfio/listener.c95
-rw-r--r--hw/vfio/meson.build1
-rw-r--r--hw/vfio/pci.c294
-rw-r--r--hw/vfio/pci.h19
-rw-r--r--hw/vfio/region.c12
-rw-r--r--hw/vfio/trace-events6
-rw-r--r--hw/vfio/vfio-cpr.h15
-rw-r--r--hw/virtio/vhost-vdpa.c9
42 files changed, 1047 insertions, 423 deletions
diff --git a/hw/char/sh_serial.c b/hw/char/sh_serial.c
index 6abd803..30447fa 100644
--- a/hw/char/sh_serial.c
+++ b/hw/char/sh_serial.c
@@ -78,10 +78,6 @@ struct SHSerialState {
qemu_irq bri;
};
-typedef struct {} SHSerialStateClass;
-
-OBJECT_DEFINE_TYPE(SHSerialState, sh_serial, SH_SERIAL, SYS_BUS_DEVICE)
-
static void sh_serial_clear_fifo(SHSerialState *s)
{
memset(s->rx_fifo, 0, SH_RX_FIFO_LENGTH);
@@ -434,17 +430,13 @@ static void sh_serial_realize(DeviceState *d, Error **errp)
s->etu = NANOSECONDS_PER_SECOND / 9600;
}
-static void sh_serial_finalize(Object *obj)
+static void sh_serial_unrealize(DeviceState *dev)
{
- SHSerialState *s = SH_SERIAL(obj);
+ SHSerialState *s = SH_SERIAL(dev);
timer_del(&s->fifo_timeout_timer);
}
-static void sh_serial_init(Object *obj)
-{
-}
-
static const Property sh_serial_properties[] = {
DEFINE_PROP_CHR("chardev", SHSerialState, chr),
DEFINE_PROP_UINT8("features", SHSerialState, feat, 0),
@@ -456,7 +448,19 @@ static void sh_serial_class_init(ObjectClass *oc, const void *data)
device_class_set_props(dc, sh_serial_properties);
dc->realize = sh_serial_realize;
+ dc->unrealize = sh_serial_unrealize;
device_class_set_legacy_reset(dc, sh_serial_reset);
/* Reason: part of SuperH CPU/SoC, needs to be wired up */
dc->user_creatable = false;
}
+
+static const TypeInfo sh_serial_types[] = {
+ {
+ .name = TYPE_SH_SERIAL,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(SHSerialState),
+ .class_init = sh_serial_class_init,
+ },
+};
+
+DEFINE_TYPES(sh_serial_types)
diff --git a/hw/display/apple-gfx.m b/hw/display/apple-gfx.m
index 8dde1f1..174d56a 100644
--- a/hw/display/apple-gfx.m
+++ b/hw/display/apple-gfx.m
@@ -454,7 +454,7 @@ static void set_cursor_glyph(void *opaque)
/* ------ DMA (device reading system memory) ------ */
typedef struct AppleGFXReadMemoryJob {
- QemuSemaphore sem;
+ QemuEvent event;
hwaddr physical_address;
uint64_t length;
void *dst;
@@ -470,7 +470,7 @@ static void apple_gfx_do_read_memory(void *opaque)
job->dst, job->length, MEMTXATTRS_UNSPECIFIED);
job->success = (r == MEMTX_OK);
- qemu_sem_post(&job->sem);
+ qemu_event_set(&job->event);
}
static bool apple_gfx_read_memory(AppleGFXState *s, hwaddr physical_address,
@@ -483,11 +483,11 @@ static bool apple_gfx_read_memory(AppleGFXState *s, hwaddr physical_address,
trace_apple_gfx_read_memory(physical_address, length, dst);
/* Performing DMA requires BQL, so do it in a BH. */
- qemu_sem_init(&job.sem, 0);
+ qemu_event_init(&job.event, 0);
aio_bh_schedule_oneshot(qemu_get_aio_context(),
apple_gfx_do_read_memory, &job);
- qemu_sem_wait(&job.sem);
- qemu_sem_destroy(&job.sem);
+ qemu_event_wait(&job.event);
+ qemu_event_destroy(&job.event);
return job.success;
}
diff --git a/hw/gpio/pca9552.c b/hw/gpio/pca9552.c
index d65c0a2..1e10238 100644
--- a/hw/gpio/pca9552.c
+++ b/hw/gpio/pca9552.c
@@ -76,7 +76,7 @@ static void pca955x_display_pins_status(PCA955xState *s,
return;
}
if (trace_event_get_state_backends(TRACE_PCA955X_GPIO_STATUS)) {
- char *buf = g_newa(char, k->pin_count + 1);
+ char buf[PCA955X_PIN_COUNT_MAX + 1];
for (i = 0; i < k->pin_count; i++) {
if (extract32(pins_status, i, 1)) {
diff --git a/hw/hyperv/hv-balloon.c b/hw/hyperv/hv-balloon.c
index 94b0abb..6dbcb2d 100644
--- a/hw/hyperv/hv-balloon.c
+++ b/hw/hyperv/hv-balloon.c
@@ -67,10 +67,6 @@
* these requests
*/
-struct HvBalloonClass {
- VMBusDeviceClass parent_class;
-} HvBalloonClass;
-
typedef enum State {
/* not a real state */
S_NO_CHANGE = 0,
@@ -162,8 +158,9 @@ typedef struct HvBalloon {
MemoryRegion *mr;
} HvBalloon;
-OBJECT_DEFINE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, HV_BALLOON, VMBUS_DEVICE, \
- { TYPE_MEMORY_DEVICE }, { })
+OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, \
+ HV_BALLOON, VMBUS_DEVICE, \
+ { TYPE_MEMORY_DEVICE }, { })
#define HV_BALLOON_SET_STATE(hvb, news) \
do { \
diff --git a/hw/intc/loongarch_pch_pic.c b/hw/intc/loongarch_pch_pic.c
index cbba2fc..ebb33ed 100644
--- a/hw/intc/loongarch_pch_pic.c
+++ b/hw/intc/loongarch_pch_pic.c
@@ -82,7 +82,7 @@ static uint64_t pch_pic_read(void *opaque, hwaddr addr, uint64_t field_mask)
addr -= offset;
switch (addr) {
case PCH_PIC_INT_ID:
- val = s->id.data;
+ val = cpu_to_le64(s->id.data);
break;
case PCH_PIC_INT_MASK:
val = s->int_mask;
diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c
index 9b6292e..14d6c52 100644
--- a/hw/loongarch/boot.c
+++ b/hw/loongarch/boot.c
@@ -35,12 +35,6 @@ struct loongarch_linux_hdr {
uint32_t pe_header_offset;
} QEMU_PACKED;
-struct memmap_entry *memmap_table;
-unsigned memmap_entries;
-
-ram_addr_t initrd_offset;
-uint64_t initrd_size;
-
static const unsigned int slave_boot_code[] = {
/* Configure reset ebase. */
0x0400302c, /* csrwr $t0, LOONGARCH_CSR_EENTRY */
@@ -94,12 +88,16 @@ static inline void *guidcpy(void *dst, const void *src)
return memcpy(dst, src, sizeof(efi_guid_t));
}
-static void init_efi_boot_memmap(struct efi_system_table *systab,
+static void init_efi_boot_memmap(MachineState *ms,
+ struct efi_system_table *systab,
void *p, void *start)
{
unsigned i;
struct efi_boot_memmap *boot_memmap = p;
efi_guid_t tbl_guid = LINUX_EFI_BOOT_MEMMAP_GUID;
+ LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms);
+ struct memmap_entry *memmap_table;
+ unsigned int memmap_entries;
/* efi_configuration_table 1 */
guidcpy(&systab->tables[0].guid, &tbl_guid);
@@ -111,6 +109,8 @@ static void init_efi_boot_memmap(struct efi_system_table *systab,
boot_memmap->map_size = 0;
efi_memory_desc_t *map = p + sizeof(struct efi_boot_memmap);
+ memmap_table = lvms->memmap_table;
+ memmap_entries = lvms->memmap_entries;
for (i = 0; i < memmap_entries; i++) {
map = (void *)boot_memmap + sizeof(*map);
map[i].type = memmap_table[i].type;
@@ -121,7 +121,8 @@ static void init_efi_boot_memmap(struct efi_system_table *systab,
}
}
-static void init_efi_initrd_table(struct efi_system_table *systab,
+static void init_efi_initrd_table(struct loongarch_boot_info *info,
+ struct efi_system_table *systab,
void *p, void *start)
{
efi_guid_t tbl_guid = LINUX_EFI_INITRD_MEDIA_GUID;
@@ -132,8 +133,8 @@ static void init_efi_initrd_table(struct efi_system_table *systab,
systab->tables[1].table = (struct efi_configuration_table *)(p - start);
systab->nr_tables = 2;
- initrd_table->base = initrd_offset;
- initrd_table->size = initrd_size;
+ initrd_table->base = info->initrd_addr;
+ initrd_table->size = info->initrd_size;
}
static void init_efi_fdt_table(struct efi_system_table *systab)
@@ -146,10 +147,12 @@ static void init_efi_fdt_table(struct efi_system_table *systab)
systab->nr_tables = 3;
}
-static void init_systab(struct loongarch_boot_info *info, void *p, void *start)
+static void init_systab(MachineState *ms,
+ struct loongarch_boot_info *info, void *p, void *start)
{
void *bp_tables_start;
struct efi_system_table *systab = p;
+ LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms);
info->a2 = p - start;
@@ -166,10 +169,10 @@ static void init_systab(struct loongarch_boot_info *info, void *p, void *start)
systab->tables = p;
bp_tables_start = p;
- init_efi_boot_memmap(systab, p, start);
+ init_efi_boot_memmap(ms, systab, p, start);
p += ROUND_UP(sizeof(struct efi_boot_memmap) +
- sizeof(efi_memory_desc_t) * memmap_entries, 64 * KiB);
- init_efi_initrd_table(systab, p, start);
+ sizeof(efi_memory_desc_t) * lvms->memmap_entries, 64 * KiB);
+ init_efi_initrd_table(info, systab, p, start);
p += ROUND_UP(sizeof(struct efi_initrd), 64 * KiB);
init_efi_fdt_table(systab);
@@ -276,8 +279,8 @@ static ram_addr_t alloc_initrd_memory(struct loongarch_boot_info *info,
static int64_t load_kernel_info(struct loongarch_boot_info *info)
{
- uint64_t kernel_entry, kernel_low, kernel_high;
- ssize_t kernel_size;
+ uint64_t kernel_entry, kernel_low, kernel_high, initrd_offset = 0;
+ ssize_t kernel_size, initrd_size;
kernel_size = load_elf(info->kernel_filename, NULL,
cpu_loongarch_virt_to_phys, NULL,
@@ -313,8 +316,9 @@ static int64_t load_kernel_info(struct loongarch_boot_info *info)
info->initrd_filename);
exit(1);
}
- } else {
- initrd_size = 0;
+
+ info->initrd_addr = initrd_offset;
+ info->initrd_size = initrd_size;
}
return kernel_entry;
@@ -369,17 +373,19 @@ static void loongarch_firmware_boot(LoongArchVirtMachineState *lvms,
fw_cfg_add_kernel_info(info, lvms->fw_cfg);
}
-static void init_boot_rom(struct loongarch_boot_info *info, void *p)
+static void init_boot_rom(MachineState *ms,
+ struct loongarch_boot_info *info, void *p)
{
void *start = p;
init_cmdline(info, p, start);
p += COMMAND_LINE_SIZE;
- init_systab(info, p, start);
+ init_systab(ms, info, p, start);
}
-static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info)
+static void loongarch_direct_kernel_boot(MachineState *ms,
+ struct loongarch_boot_info *info)
{
void *p, *bp;
int64_t kernel_addr = VIRT_FLASH0_BASE;
@@ -397,7 +403,7 @@ static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info)
/* Load cmdline and system tables at [0 - 1 MiB] */
p = g_malloc0(1 * MiB);
bp = p;
- init_boot_rom(info, p);
+ init_boot_rom(ms, info, p);
rom_add_blob_fixed_as("boot_info", bp, 1 * MiB, 0, &address_space_memory);
/* Load slave boot code at pflash0 . */
@@ -437,6 +443,6 @@ void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info)
if (lvms->bios_loaded) {
loongarch_firmware_boot(lvms, info);
} else {
- loongarch_direct_kernel_boot(info);
+ loongarch_direct_kernel_boot(ms, info);
}
}
diff --git a/hw/loongarch/virt-acpi-build.c b/hw/loongarch/virt-acpi-build.c
index 073b6de..2cd2d9d 100644
--- a/hw/loongarch/virt-acpi-build.c
+++ b/hw/loongarch/virt-acpi-build.c
@@ -575,8 +575,8 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine)
acpi_add_table(table_offsets, tables_blob);
{
AcpiMcfgInfo mcfg = {
- .base = cpu_to_le64(VIRT_PCI_CFG_BASE),
- .size = cpu_to_le64(VIRT_PCI_CFG_SIZE),
+ .base = VIRT_PCI_CFG_BASE,
+ .size = VIRT_PCI_CFG_SIZE,
};
build_mcfg(tables_blob, tables->linker, &mcfg, lvms->oem_id,
lvms->oem_table_id);
diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 1b50404..34dfbd1 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -136,6 +136,10 @@ static void virt_build_smbios(LoongArchVirtMachineState *lvms)
return;
}
+ if (kvm_enabled()) {
+ product = "KVM Virtual Machine";
+ }
+
smbios_set_defaults("QEMU", product, mc->name);
smbios_get_tables(ms, SMBIOS_ENTRY_POINT_TYPE_64,
@@ -168,8 +172,15 @@ static void virt_powerdown_req(Notifier *notifier, void *opaque)
acpi_send_event(s->acpi_ged, ACPI_POWER_DOWN_STATUS);
}
-static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type)
+static void memmap_add_entry(MachineState *ms, uint64_t address,
+ uint64_t length, uint32_t type)
{
+ LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms);
+ struct memmap_entry *memmap_table;
+ unsigned int memmap_entries;
+
+ memmap_table = lvms->memmap_table;
+ memmap_entries = lvms->memmap_entries;
/* Ensure there are no duplicate entries. */
for (unsigned i = 0; i < memmap_entries; i++) {
assert(memmap_table[i].address != address);
@@ -182,6 +193,8 @@ static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type)
memmap_table[memmap_entries].type = cpu_to_le32(type);
memmap_table[memmap_entries].reserved = 0;
memmap_entries++;
+ lvms->memmap_table = memmap_table;
+ lvms->memmap_entries = memmap_entries;
}
static DeviceState *create_acpi_ged(DeviceState *pch_pic,
@@ -619,13 +632,13 @@ static void fw_cfg_add_memory(MachineState *ms)
}
if (size >= gap) {
- memmap_add_entry(base, gap, 1);
+ memmap_add_entry(ms, base, gap, 1);
size -= gap;
base = VIRT_HIGHMEM_BASE;
}
if (size) {
- memmap_add_entry(base, size, 1);
+ memmap_add_entry(ms, base, size, 1);
base += size;
}
@@ -640,7 +653,7 @@ static void fw_cfg_add_memory(MachineState *ms)
* lowram: [base, +(gap - numa_info[0].node_mem))
* highram: [VIRT_HIGHMEM_BASE, +(ram_size - gap))
*/
- memmap_add_entry(base, gap - numa_info[0].node_mem, 1);
+ memmap_add_entry(ms, base, gap - numa_info[0].node_mem, 1);
size = ram_size - gap;
base = VIRT_HIGHMEM_BASE;
} else {
@@ -648,7 +661,7 @@ static void fw_cfg_add_memory(MachineState *ms)
}
if (size) {
- memmap_add_entry(base, size, 1);
+ memmap_add_entry(ms, base, size, 1);
}
}
@@ -734,8 +747,8 @@ static void virt_init(MachineState *machine)
rom_set_fw(lvms->fw_cfg);
if (lvms->fw_cfg != NULL) {
fw_cfg_add_file(lvms->fw_cfg, "etc/memmap",
- memmap_table,
- sizeof(struct memmap_entry) * (memmap_entries));
+ lvms->memmap_table,
+ sizeof(struct memmap_entry) * lvms->memmap_entries);
}
/* Initialize the IO interrupt subsystem */
diff --git a/hw/misc/stm32_rcc.c b/hw/misc/stm32_rcc.c
index 94e8dae..5815b3e 100644
--- a/hw/misc/stm32_rcc.c
+++ b/hw/misc/stm32_rcc.c
@@ -60,7 +60,7 @@ static void stm32_rcc_write(void *opaque, hwaddr addr,
uint32_t value = val64;
uint32_t prev_value, new_value, irq_offset;
- trace_stm32_rcc_write(value, addr);
+ trace_stm32_rcc_write(addr, value);
if (addr > STM32_RCC_DCKCFGR2) {
qemu_log_mask(LOG_GUEST_ERROR, "%s: Bad offset 0x%"HWADDR_PRIx"\n",
diff --git a/hw/net/fsl_etsec/etsec.c b/hw/net/fsl_etsec/etsec.c
index d14cb2a..846f6cb 100644
--- a/hw/net/fsl_etsec/etsec.c
+++ b/hw/net/fsl_etsec/etsec.c
@@ -389,6 +389,7 @@ static void etsec_realize(DeviceState *dev, Error **errp)
{
eTSEC *etsec = ETSEC_COMMON(dev);
+ qemu_macaddr_default_if_unset(&etsec->conf.macaddr);
etsec->nic = qemu_new_nic(&net_etsec_info, &etsec->conf,
object_get_typename(OBJECT(dev)), dev->id,
&dev->mem_reentrancy_guard, etsec);
diff --git a/hw/net/i82596.c b/hw/net/i82596.c
index 64ed3c8..c1ff3e6 100644
--- a/hw/net/i82596.c
+++ b/hw/net/i82596.c
@@ -5,7 +5,7 @@
* This work is licensed under the GNU GPL license version 2 or later.
*
* This software was written to be compatible with the specification:
- * https://www.intel.com/assets/pdf/general/82596ca.pdf
+ * https://parisc.docs.kernel.org/en/latest/_downloads/96672be0650d9fc046bbcea40b92482f/82596CA.pdf
*/
#include "qemu/osdep.h"
@@ -177,6 +177,26 @@ static void set_individual_address(I82596State *s, uint32_t addr)
trace_i82596_new_mac(nc->info_str);
}
+static void i82596_configure(I82596State *s, uint32_t addr)
+{
+ uint8_t byte_cnt;
+ byte_cnt = get_byte(addr + 8) & 0x0f;
+
+ byte_cnt = MAX(byte_cnt, 4);
+ byte_cnt = MIN(byte_cnt, sizeof(s->config));
+ /* copy byte_cnt max. */
+ address_space_read(&address_space_memory, addr + 8,
+ MEMTXATTRS_UNSPECIFIED, s->config, byte_cnt);
+ /* config byte according to page 35ff */
+ s->config[2] &= 0x82; /* mask valid bits */
+ s->config[2] |= 0x40;
+ s->config[7] &= 0xf7; /* clear zero bit */
+ assert(I596_NOCRC_INS == 0); /* do CRC insertion */
+ s->config[10] = MAX(s->config[10], 5); /* min frame length */
+ s->config[12] &= 0x40; /* only full duplex field valid */
+ s->config[13] |= 0x3f; /* set ones in byte 13 */
+}
+
static void set_multicast_list(I82596State *s, uint32_t addr)
{
uint16_t mc_count, i;
@@ -234,7 +254,6 @@ static void command_loop(I82596State *s)
{
uint16_t cmd;
uint16_t status;
- uint8_t byte_cnt;
DBG(printf("STARTING COMMAND LOOP cmd_p=%08x\n", s->cmd_p));
@@ -254,20 +273,7 @@ static void command_loop(I82596State *s)
set_individual_address(s, s->cmd_p);
break;
case CmdConfigure:
- byte_cnt = get_byte(s->cmd_p + 8) & 0x0f;
- byte_cnt = MAX(byte_cnt, 4);
- byte_cnt = MIN(byte_cnt, sizeof(s->config));
- /* copy byte_cnt max. */
- address_space_read(&address_space_memory, s->cmd_p + 8,
- MEMTXATTRS_UNSPECIFIED, s->config, byte_cnt);
- /* config byte according to page 35ff */
- s->config[2] &= 0x82; /* mask valid bits */
- s->config[2] |= 0x40;
- s->config[7] &= 0xf7; /* clear zero bit */
- assert(I596_NOCRC_INS == 0); /* do CRC insertion */
- s->config[10] = MAX(s->config[10], 5); /* min frame length */
- s->config[12] &= 0x40; /* only full duplex field valid */
- s->config[13] |= 0x3f; /* set ones in byte 13 */
+ i82596_configure(s, s->cmd_p);
break;
case CmdTDR:
/* get signal LINK */
diff --git a/hw/pci-host/ppce500.c b/hw/pci-host/ppce500.c
index e97a515..52269b0 100644
--- a/hw/pci-host/ppce500.c
+++ b/hw/pci-host/ppce500.c
@@ -16,7 +16,6 @@
#include "qemu/osdep.h"
#include "hw/irq.h"
-#include "hw/ppc/e500-ccsr.h"
#include "hw/qdev-properties.h"
#include "migration/vmstate.h"
#include "hw/pci/pci_device.h"
@@ -418,11 +417,12 @@ static const VMStateDescription vmstate_ppce500_pci = {
static void e500_pcihost_bridge_realize(PCIDevice *d, Error **errp)
{
PPCE500PCIBridgeState *b = PPC_E500_PCI_BRIDGE(d);
- PPCE500CCSRState *ccsr = CCSR(
+ SysBusDevice *ccsr = SYS_BUS_DEVICE(
object_resolve_path_component(qdev_get_machine(), "e500-ccsr"));
+ MemoryRegion *ccsr_space = sysbus_mmio_get_region(ccsr, 0);
- memory_region_init_alias(&b->bar0, OBJECT(ccsr), "e500-pci-bar0", &ccsr->ccsr_space,
- 0, int128_get64(ccsr->ccsr_space.size));
+ memory_region_init_alias(&b->bar0, OBJECT(ccsr), "e500-pci-bar0",
+ ccsr_space, 0, int128_get64(ccsr_space->size));
pci_register_bar(d, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &b->bar0);
}
diff --git a/hw/pci-host/raven.c b/hw/pci-host/raven.c
index 21f7ca6..f8c0be5 100644
--- a/hw/pci-host/raven.c
+++ b/hw/pci-host/raven.c
@@ -24,7 +24,6 @@
*/
#include "qemu/osdep.h"
-#include "qemu/datadir.h"
#include "qemu/units.h"
#include "qemu/log.h"
#include "qapi/error.h"
@@ -35,9 +34,7 @@
#include "migration/vmstate.h"
#include "hw/intc/i8259.h"
#include "hw/irq.h"
-#include "hw/loader.h"
#include "hw/or-irq.h"
-#include "elf.h"
#include "qom/object.h"
#define TYPE_RAVEN_PCI_DEVICE "raven"
@@ -47,10 +44,6 @@ OBJECT_DECLARE_SIMPLE_TYPE(RavenPCIState, RAVEN_PCI_DEVICE)
struct RavenPCIState {
PCIDevice dev;
-
- uint32_t elf_machine;
- char *bios_name;
- MemoryRegion bios;
};
typedef struct PRePPCIState PREPPCIState;
@@ -75,11 +68,8 @@ struct PRePPCIState {
RavenPCIState pci_dev;
int contiguous_map;
- bool is_legacy_prep;
};
-#define BIOS_SIZE (1 * MiB)
-
#define PCI_IO_BASE_ADDR 0x80000000 /* Physical address on main bus */
static inline uint32_t raven_pci_io_config(hwaddr addr)
@@ -243,22 +233,18 @@ static void raven_pcihost_realizefn(DeviceState *d, Error **errp)
MemoryRegion *address_space_mem = get_system_memory();
int i;
- if (s->is_legacy_prep) {
- for (i = 0; i < PCI_NUM_PINS; i++) {
- sysbus_init_irq(dev, &s->pci_irqs[i]);
- }
- } else {
- /* According to PReP specification section 6.1.6 "System Interrupt
- * Assignments", all PCI interrupts are routed via IRQ 15 */
- s->or_irq = OR_IRQ(object_new(TYPE_OR_IRQ));
- object_property_set_int(OBJECT(s->or_irq), "num-lines", PCI_NUM_PINS,
- &error_fatal);
- qdev_realize(DEVICE(s->or_irq), NULL, &error_fatal);
- sysbus_init_irq(dev, &s->or_irq->out_irq);
-
- for (i = 0; i < PCI_NUM_PINS; i++) {
- s->pci_irqs[i] = qdev_get_gpio_in(DEVICE(s->or_irq), i);
- }
+ /*
+ * According to PReP specification section 6.1.6 "System Interrupt
+ * Assignments", all PCI interrupts are routed via IRQ 15
+ */
+ s->or_irq = OR_IRQ(object_new(TYPE_OR_IRQ));
+ object_property_set_int(OBJECT(s->or_irq), "num-lines", PCI_NUM_PINS,
+ &error_fatal);
+ qdev_realize(DEVICE(s->or_irq), NULL, &error_fatal);
+ sysbus_init_irq(dev, &s->or_irq->out_irq);
+
+ for (i = 0; i < PCI_NUM_PINS; i++) {
+ s->pci_irqs[i] = qdev_get_gpio_in(DEVICE(s->or_irq), i);
}
qdev_init_gpio_in(d, raven_change_gpio, 1);
@@ -338,48 +324,9 @@ static void raven_pcihost_initfn(Object *obj)
static void raven_realize(PCIDevice *d, Error **errp)
{
- RavenPCIState *s = RAVEN_PCI_DEVICE(d);
- char *filename;
- int bios_size = -1;
-
d->config[PCI_CACHE_LINE_SIZE] = 0x08;
d->config[PCI_LATENCY_TIMER] = 0x10;
d->config[PCI_CAPABILITY_LIST] = 0x00;
-
- if (!memory_region_init_rom_nomigrate(&s->bios, OBJECT(s), "bios",
- BIOS_SIZE, errp)) {
- return;
- }
- memory_region_add_subregion(get_system_memory(), (uint32_t)(-BIOS_SIZE),
- &s->bios);
- if (s->bios_name) {
- filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, s->bios_name);
- if (filename) {
- if (s->elf_machine != EM_NONE) {
- bios_size = load_elf(filename, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL,
- ELFDATA2MSB, s->elf_machine, 0, 0);
- }
- if (bios_size < 0) {
- bios_size = get_image_size(filename);
- if (bios_size > 0 && bios_size <= BIOS_SIZE) {
- hwaddr bios_addr;
- bios_size = (bios_size + 0xfff) & ~0xfff;
- bios_addr = (uint32_t)(-BIOS_SIZE);
- bios_size = load_image_targphys(filename, bios_addr,
- bios_size);
- }
- }
- }
- g_free(filename);
- if (bios_size < 0 || bios_size > BIOS_SIZE) {
- memory_region_del_subregion(get_system_memory(), &s->bios);
- error_setg(errp, "Could not load bios image '%s'", s->bios_name);
- return;
- }
- }
-
- vmstate_register_ram_global(&s->bios);
}
static const VMStateDescription vmstate_raven = {
@@ -422,22 +369,12 @@ static const TypeInfo raven_info = {
},
};
-static const Property raven_pcihost_properties[] = {
- DEFINE_PROP_UINT32("elf-machine", PREPPCIState, pci_dev.elf_machine,
- EM_NONE),
- DEFINE_PROP_STRING("bios-name", PREPPCIState, pci_dev.bios_name),
- /* Temporary workaround until legacy prep machine is removed */
- DEFINE_PROP_BOOL("is-legacy-prep", PREPPCIState, is_legacy_prep,
- false),
-};
-
static void raven_pcihost_class_init(ObjectClass *klass, const void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
dc->realize = raven_pcihost_realizefn;
- device_class_set_props(dc, raven_pcihost_properties);
dc->fw_name = "pci";
}
diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index 66f27b9..8c7f670 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -72,7 +72,7 @@ static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
return dev->msix_pba + vector / 8;
}
-static int msix_is_pending(PCIDevice *dev, int vector)
+int msix_is_pending(PCIDevice *dev, unsigned int vector)
{
return *msix_pending_byte(dev, vector) & msix_pending_mask(vector);
}
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 9b4bf48..c70b5ce 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -32,6 +32,7 @@
#include "hw/pci/pci_host.h"
#include "hw/qdev-properties.h"
#include "hw/qdev-properties-system.h"
+#include "migration/cpr.h"
#include "migration/qemu-file-types.h"
#include "migration/vmstate.h"
#include "net/net.h"
@@ -537,6 +538,10 @@ static void pci_reset_regions(PCIDevice *dev)
static void pci_do_device_reset(PCIDevice *dev)
{
+ if ((dev->cap_present & QEMU_PCI_SKIP_RESET_ON_CPR) && cpr_is_incoming()) {
+ return;
+ }
+
pci_device_deassert_intx(dev);
assert(dev->irq_state == 0);
diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index 809078a..6899802 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -79,8 +79,6 @@
#define MPC85XX_ESDHC_IRQ 72
#define RTC_REGS_OFFSET 0x68
-#define PLATFORM_CLK_FREQ_HZ (400 * 1000 * 1000)
-
struct boot_info
{
uint32_t dt_base;
@@ -120,7 +118,7 @@ static uint32_t *pci_map_create(void *fdt, uint32_t mpic, int first_slot,
}
static void dt_serial_create(void *fdt, unsigned long long offset,
- const char *soc, const char *mpic,
+ const char *soc, uint32_t freq, const char *mpic,
const char *alias, int idx, bool defcon)
{
char *ser;
@@ -131,7 +129,7 @@ static void dt_serial_create(void *fdt, unsigned long long offset,
qemu_fdt_setprop_string(fdt, ser, "compatible", "ns16550");
qemu_fdt_setprop_cells(fdt, ser, "reg", offset, 0x100);
qemu_fdt_setprop_cell(fdt, ser, "cell-index", idx);
- qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", PLATFORM_CLK_FREQ_HZ);
+ qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", freq);
qemu_fdt_setprop_cells(fdt, ser, "interrupts", 42, 2);
qemu_fdt_setprop_phandle(fdt, ser, "interrupt-parent", mpic);
qemu_fdt_setprop_string(fdt, "/aliases", alias, ser);
@@ -382,8 +380,7 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms,
int fdt_size;
void *fdt;
uint8_t hypercall[16];
- uint32_t clock_freq = PLATFORM_CLK_FREQ_HZ;
- uint32_t tb_freq = PLATFORM_CLK_FREQ_HZ;
+ uint32_t clock_freq, tb_freq;
int i;
char compatible_sb[] = "fsl,mpc8544-immr\0simple-bus";
char *soc;
@@ -484,6 +481,9 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms,
if (kvmppc_get_hasidle(env)) {
qemu_fdt_setprop(fdt, "/hypervisor", "has-idle", NULL, 0);
}
+ } else {
+ clock_freq = pmc->clock_freq;
+ tb_freq = pmc->tb_freq;
}
/* Create CPU nodes */
@@ -564,12 +564,12 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms,
*/
if (serial_hd(1)) {
dt_serial_create(fdt, MPC8544_SERIAL1_REGS_OFFSET,
- soc, mpic, "serial1", 1, false);
+ soc, pmc->clock_freq, mpic, "serial1", 1, false);
}
if (serial_hd(0)) {
dt_serial_create(fdt, MPC8544_SERIAL0_REGS_OFFSET,
- soc, mpic, "serial0", 0, true);
+ soc, pmc->clock_freq, mpic, "serial0", 0, true);
}
/* i2c */
@@ -931,7 +931,6 @@ void ppce500_init(MachineState *machine)
CPUPPCState *firstenv = NULL;
MemoryRegion *ccsr_addr_space;
SysBusDevice *s;
- PPCE500CCSRState *ccsr;
I2CBus *i2c;
irqs = g_new0(IrqLines, smp_cpus);
@@ -968,7 +967,7 @@ void ppce500_init(MachineState *machine)
env->spr_cb[SPR_BOOKE_PIR].default_value = cs->cpu_index = i;
env->mpic_iack = pmc->ccsrbar_base + MPC8544_MPIC_REGS_OFFSET + 0xa0;
- ppc_booke_timers_init(cpu, PLATFORM_CLK_FREQ_HZ, PPC_TIMER_E500);
+ ppc_booke_timers_init(cpu, pmc->tb_freq, PPC_TIMER_E500);
/* Register reset handler */
if (!i) {
@@ -993,10 +992,10 @@ void ppce500_init(MachineState *machine)
memory_region_add_subregion(address_space_mem, 0, machine->ram);
dev = qdev_new("e500-ccsr");
+ s = SYS_BUS_DEVICE(dev);
object_property_add_child(OBJECT(machine), "e500-ccsr", OBJECT(dev));
- sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
- ccsr = CCSR(dev);
- ccsr_addr_space = &ccsr->ccsr_space;
+ sysbus_realize_and_unref(s, &error_fatal);
+ ccsr_addr_space = sysbus_mmio_get_region(s, 0);
memory_region_add_subregion(address_space_mem, pmc->ccsrbar_base,
ccsr_addr_space);
@@ -1284,6 +1283,7 @@ static void e500_ccsr_initfn(Object *obj)
PPCE500CCSRState *ccsr = CCSR(obj);
memory_region_init(&ccsr->ccsr_space, obj, "e500-ccsr",
MPC8544_CCSRBAR_SIZE);
+ sysbus_init_mmio(SYS_BUS_DEVICE(ccsr), &ccsr->ccsr_space);
}
static const TypeInfo e500_ccsr_info = {
diff --git a/hw/ppc/e500.h b/hw/ppc/e500.h
index 01db102..00f4905 100644
--- a/hw/ppc/e500.h
+++ b/hw/ppc/e500.h
@@ -5,6 +5,8 @@
#include "hw/platform-bus.h"
#include "qom/object.h"
+#define PLATFORM_CLK_FREQ_HZ (400 * 1000 * 1000)
+
struct PPCE500MachineState {
/*< private >*/
MachineState parent_obj;
@@ -37,6 +39,8 @@ struct PPCE500MachineClass {
hwaddr pci_mmio_base;
hwaddr pci_mmio_bus_base;
hwaddr spin_base;
+ uint32_t clock_freq;
+ uint32_t tb_freq;
};
void ppce500_init(MachineState *machine);
diff --git a/hw/ppc/e500plat.c b/hw/ppc/e500plat.c
index 775b9d8..4f1d659 100644
--- a/hw/ppc/e500plat.c
+++ b/hw/ppc/e500plat.c
@@ -93,6 +93,8 @@ static void e500plat_machine_class_init(ObjectClass *oc, const void *data)
pmc->pci_mmio_base = 0xC00000000ULL;
pmc->pci_mmio_bus_base = 0xE0000000ULL;
pmc->spin_base = 0xFEF000000ULL;
+ pmc->clock_freq = PLATFORM_CLK_FREQ_HZ;
+ pmc->tb_freq = PLATFORM_CLK_FREQ_HZ;
mc->desc = "generic paravirt e500 platform";
mc->init = e500plat_init;
diff --git a/hw/ppc/mpc8544ds.c b/hw/ppc/mpc8544ds.c
index 97fb0f3..5826985 100644
--- a/hw/ppc/mpc8544ds.c
+++ b/hw/ppc/mpc8544ds.c
@@ -55,6 +55,8 @@ static void mpc8544ds_machine_class_init(ObjectClass *oc, const void *data)
pmc->pci_mmio_bus_base = 0xC0000000ULL;
pmc->pci_pio_base = 0xE1000000ULL;
pmc->spin_base = 0xEF000000ULL;
+ pmc->clock_freq = PLATFORM_CLK_FREQ_HZ;
+ pmc->tb_freq = PLATFORM_CLK_FREQ_HZ;
mc->desc = "mpc8544ds";
mc->init = mpc8544ds_init;
diff --git a/hw/ppc/prep.c b/hw/ppc/prep.c
index 7395263..982e40e 100644
--- a/hw/ppc/prep.c
+++ b/hw/ppc/prep.c
@@ -35,6 +35,7 @@
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
+#include "qemu/datadir.h"
#include "hw/loader.h"
#include "hw/rtc/mc146818rtc.h"
#include "hw/isa/pc87312.h"
@@ -55,6 +56,8 @@
#define KERNEL_LOAD_ADDR 0x01000000
#define INITRD_LOAD_ADDR 0x01800000
+#define BIOS_ADDR 0xfff00000
+#define BIOS_SIZE (1 * MiB)
#define NVRAM_SIZE 0x2000
static void fw_cfg_boot_set(void *opaque, const char *boot_device,
@@ -241,6 +244,9 @@ static void ibm_40p_init(MachineState *machine)
ISADevice *isa_dev;
ISABus *isa_bus;
void *fw_cfg;
+ MemoryRegion *bios = g_new(MemoryRegion, 1);
+ char *filename;
+ ssize_t bios_size = -1;
uint32_t kernel_base = 0, initrd_base = 0;
long kernel_size = 0, initrd_size = 0;
char boot_device;
@@ -263,10 +269,27 @@ static void ibm_40p_init(MachineState *machine)
cpu_ppc_tb_init(env, 100UL * 1000UL * 1000UL);
qemu_register_reset(ppc_prep_reset, cpu);
+ /* allocate and load firmware */
+ filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
+ if (!filename) {
+ error_report("Could not find bios image '%s'", bios_name);
+ exit(1);
+ }
+ memory_region_init_rom(bios, NULL, "bios", BIOS_SIZE, &error_fatal);
+ memory_region_add_subregion(get_system_memory(), BIOS_ADDR, bios);
+ bios_size = load_elf(filename, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ ELFDATA2MSB, PPC_ELF_MACHINE, 0, 0);
+ if (bios_size < 0) {
+ bios_size = load_image_targphys(filename, BIOS_ADDR, BIOS_SIZE);
+ }
+ if (bios_size < 0 || bios_size > BIOS_SIZE) {
+ error_report("Could not load bios image '%s'", filename);
+ return;
+ }
+ g_free(filename);
+
/* PCI host */
dev = qdev_new("raven-pcihost");
- qdev_prop_set_string(dev, "bios-name", bios_name);
- qdev_prop_set_uint32(dev, "elf-machine", PPC_ELF_MACHINE);
pcihost = SYS_BUS_DEVICE(dev);
object_property_add_child(qdev_get_machine(), "raven", OBJECT(dev));
sysbus_realize_and_unref(pcihost, &error_fatal);
diff --git a/hw/riscv/riscv-iommu-pci.c b/hw/riscv/riscv-iommu-pci.c
index 1f44eef..cdb4a7a 100644
--- a/hw/riscv/riscv-iommu-pci.c
+++ b/hw/riscv/riscv-iommu-pci.c
@@ -68,12 +68,6 @@ typedef struct RISCVIOMMUStatePci {
RISCVIOMMUState iommu; /* common IOMMU state */
} RISCVIOMMUStatePci;
-struct RISCVIOMMUPciClass {
- /*< public >*/
- DeviceRealize parent_realize;
- ResettablePhases parent_phases;
-};
-
/* interrupt delivery callback */
static void riscv_iommu_pci_notify(RISCVIOMMUState *iommu, unsigned vector)
{
diff --git a/hw/riscv/riscv-iommu-sys.c b/hw/riscv/riscv-iommu-sys.c
index 74e76b9..e34d00a 100644
--- a/hw/riscv/riscv-iommu-sys.c
+++ b/hw/riscv/riscv-iommu-sys.c
@@ -53,12 +53,6 @@ struct RISCVIOMMUStateSys {
uint8_t *msix_pba;
};
-struct RISCVIOMMUSysClass {
- /*< public >*/
- DeviceRealize parent_realize;
- ResettablePhases parent_phases;
-};
-
static uint64_t msix_table_mmio_read(void *opaque, hwaddr addr,
unsigned size)
{
diff --git a/hw/s390x/ap-stub.c b/hw/s390x/ap-stub.c
new file mode 100644
index 0000000..001fe5f
--- /dev/null
+++ b/hw/s390x/ap-stub.c
@@ -0,0 +1,21 @@
+/*
+ * VFIO based AP matrix device assignment
+ *
+ * Copyright 2025 IBM Corp.
+ * Author(s): Rorie Reyes <rreyes@linux.ibm.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/s390x/ap-bridge.h"
+
+int ap_chsc_sei_nt0_get_event(void *res)
+{
+ return EVENT_INFORMATION_NOT_STORED;
+}
+
+bool ap_chsc_sei_nt0_have_event(void)
+{
+ return false;
+}
diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build
index 3bbebfd..99cbcbd 100644
--- a/hw/s390x/meson.build
+++ b/hw/s390x/meson.build
@@ -33,6 +33,7 @@ s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files(
))
s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c'))
s390x_ss.add(when: 'CONFIG_VFIO', if_true: files('s390-pci-vfio.c'))
+s390x_ss.add(when: 'CONFIG_VFIO_AP', if_false: files('ap-stub.c'))
virtio_ss = ss.source_set()
virtio_ss.add(files('virtio-ccw.c'))
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index f69a4d8..ce3c13d 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -1145,20 +1145,6 @@ static void ccw_machine_4_2_class_options(MachineClass *mc)
}
DEFINE_CCW_MACHINE(4, 2);
-static void ccw_machine_4_1_instance_options(MachineState *machine)
-{
- static const S390FeatInit qemu_cpu_feat = { S390_FEAT_LIST_QEMU_V4_1 };
- ccw_machine_4_2_instance_options(machine);
- s390_set_qemu_cpu_model(0x2964, 13, 2, qemu_cpu_feat);
-}
-
-static void ccw_machine_4_1_class_options(MachineClass *mc)
-{
- ccw_machine_4_2_class_options(mc);
- compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len);
-}
-DEFINE_CCW_MACHINE(4, 1);
-
static void ccw_machine_register_types(void)
{
type_register_static(&ccw_machine_info);
diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c
index 0fd1337..cb48cc1 100644
--- a/hw/timer/hpet.c
+++ b/hw/timer/hpet.c
@@ -328,16 +328,16 @@ static const VMStateDescription vmstate_hpet_timer = {
static const VMStateDescription vmstate_hpet = {
.name = "hpet",
.version_id = 2,
- .minimum_version_id = 1,
+ .minimum_version_id = 2,
.pre_save = hpet_pre_save,
.post_load = hpet_post_load,
.fields = (const VMStateField[]) {
VMSTATE_UINT64(config, HPETState),
VMSTATE_UINT64(isr, HPETState),
VMSTATE_UINT64(hpet_counter, HPETState),
- VMSTATE_UINT8_V(num_timers_save, HPETState, 2),
+ VMSTATE_UINT8(num_timers_save, HPETState),
VMSTATE_VALIDATE("num_timers must match", hpet_validate_num_timers),
- VMSTATE_STRUCT_VARRAY_UINT8(timer, HPETState, num_timers, 0,
+ VMSTATE_STRUCT_VARRAY_UINT8(timer, HPETState, num_timers_save, 0,
vmstate_hpet_timer, HPETTimer),
VMSTATE_END_OF_LIST()
},
@@ -691,8 +691,14 @@ static void hpet_realize(DeviceState *dev, Error **errp)
int i;
HPETTimer *timer;
+ if (s->num_timers < HPET_MIN_TIMERS || s->num_timers > HPET_MAX_TIMERS) {
+ error_setg(errp, "hpet.num_timers must be between %d and %d",
+ HPET_MIN_TIMERS, HPET_MAX_TIMERS);
+ return;
+ }
if (!s->intcap) {
- warn_report("Hpet's intcap not initialized");
+ error_setg(errp, "hpet.hpet-intcap not initialized");
+ return;
}
if (hpet_fw_cfg.count == UINT8_MAX) {
/* first instance */
@@ -700,7 +706,7 @@ static void hpet_realize(DeviceState *dev, Error **errp)
}
if (hpet_fw_cfg.count == 8) {
- error_setg(errp, "Only 8 instances of HPET is allowed");
+ error_setg(errp, "Only 8 instances of HPET are allowed");
return;
}
@@ -710,11 +716,6 @@ static void hpet_realize(DeviceState *dev, Error **errp)
sysbus_init_irq(sbd, &s->irqs[i]);
}
- if (s->num_timers < HPET_MIN_TIMERS) {
- s->num_timers = HPET_MIN_TIMERS;
- } else if (s->num_timers > HPET_MAX_TIMERS) {
- s->num_timers = HPET_MAX_TIMERS;
- }
for (i = 0; i < HPET_MAX_TIMERS; i++) {
timer = &s->timer[i];
timer->qemu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, hpet_timer, timer);
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 785c0a0..874e0d1 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -10,6 +10,7 @@
* directory.
*/
+#include <stdbool.h>
#include "qemu/osdep.h"
#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <linux/vfio.h>
@@ -18,8 +19,10 @@
#include "hw/vfio/vfio-device.h"
#include "system/iommufd.h"
#include "hw/s390x/ap-device.h"
+#include "hw/s390x/css.h"
#include "qemu/error-report.h"
#include "qemu/event_notifier.h"
+#include "qemu/lockable.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "qemu/option.h"
@@ -37,8 +40,18 @@ struct VFIOAPDevice {
APDevice apdev;
VFIODevice vdev;
EventNotifier req_notifier;
+ EventNotifier cfg_notifier;
};
+typedef struct APConfigChgEvent {
+ QTAILQ_ENTRY(APConfigChgEvent) next;
+} APConfigChgEvent;
+
+static QTAILQ_HEAD(, APConfigChgEvent) cfg_chg_events =
+ QTAILQ_HEAD_INITIALIZER(cfg_chg_events);
+
+static QemuMutex cfg_chg_events_lock;
+
OBJECT_DECLARE_SIMPLE_TYPE(VFIOAPDevice, VFIO_AP_DEVICE)
static void vfio_ap_compute_needs_reset(VFIODevice *vdev)
@@ -70,6 +83,57 @@ static void vfio_ap_req_notifier_handler(void *opaque)
}
}
+static void vfio_ap_cfg_chg_notifier_handler(void *opaque)
+{
+ APConfigChgEvent *cfg_chg_event;
+ VFIOAPDevice *vapdev = opaque;
+
+ if (!event_notifier_test_and_clear(&vapdev->cfg_notifier)) {
+ return;
+ }
+
+ cfg_chg_event = g_new0(APConfigChgEvent, 1);
+
+ WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) {
+ QTAILQ_INSERT_TAIL(&cfg_chg_events, cfg_chg_event, next);
+ }
+
+ css_generate_css_crws(0);
+
+}
+
+int ap_chsc_sei_nt0_get_event(void *res)
+{
+ ChscSeiNt0Res *nt0_res = (ChscSeiNt0Res *)res;
+ APConfigChgEvent *cfg_chg_event;
+
+ WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) {
+ if (QTAILQ_EMPTY(&cfg_chg_events)) {
+ return EVENT_INFORMATION_NOT_STORED;
+ }
+
+ cfg_chg_event = QTAILQ_FIRST(&cfg_chg_events);
+ QTAILQ_REMOVE(&cfg_chg_events, cfg_chg_event, next);
+ }
+
+ memset(nt0_res, 0, sizeof(*nt0_res));
+ g_free(cfg_chg_event);
+ nt0_res->flags |= PENDING_EVENT_INFO_BITMASK;
+ nt0_res->length = sizeof(ChscSeiNt0Res);
+ nt0_res->code = NT0_RES_RESPONSE_CODE;
+ nt0_res->nt = NT0_RES_NT_DEFAULT;
+ nt0_res->rs = NT0_RES_RS_AP_CHANGE;
+ nt0_res->cc = NT0_RES_CC_AP_CHANGE;
+
+ return EVENT_INFORMATION_STORED;
+}
+
+bool ap_chsc_sei_nt0_have_event(void)
+{
+ QEMU_LOCK_GUARD(&cfg_chg_events_lock);
+ return !QTAILQ_EMPTY(&cfg_chg_events);
+}
+
static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev,
unsigned int irq, Error **errp)
{
@@ -85,6 +149,10 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev,
notifier = &vapdev->req_notifier;
fd_read = vfio_ap_req_notifier_handler;
break;
+ case VFIO_AP_CFG_CHG_IRQ_INDEX:
+ notifier = &vapdev->cfg_notifier;
+ fd_read = vfio_ap_cfg_chg_notifier_handler;
+ break;
default:
error_setg(errp, "vfio: Unsupported device irq(%d)", irq);
return false;
@@ -137,6 +205,9 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev,
case VFIO_AP_REQ_IRQ_INDEX:
notifier = &vapdev->req_notifier;
break;
+ case VFIO_AP_CFG_CHG_IRQ_INDEX:
+ notifier = &vapdev->cfg_notifier;
+ break;
default:
error_report("vfio: Unsupported device irq(%d)", irq);
return;
@@ -159,6 +230,13 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
VFIODevice *vbasedev = &vapdev->vdev;
+ static bool lock_initialized;
+
+ if (!lock_initialized) {
+ qemu_mutex_init(&cfg_chg_events_lock);
+ lock_initialized = true;
+ }
+
if (!vfio_device_get_name(vbasedev, errp)) {
return;
}
@@ -176,6 +254,15 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
warn_report_err(err);
}
+ if (!vfio_ap_register_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX, &err))
+ {
+ /*
+ * Report this error, but do not make it a failing condition.
+ * Lack of this IRQ in the host does not prevent normal operation.
+ */
+ warn_report_err(err);
+ }
+
return;
error:
@@ -188,6 +275,7 @@ static void vfio_ap_unrealize(DeviceState *dev)
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX);
+ vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX);
vfio_device_detach(&vapdev->vdev);
g_free(vapdev->vdev.name);
}
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 1c6ca94..d834bd4 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -75,12 +75,12 @@ void vfio_address_space_insert(VFIOAddressSpace *space,
int vfio_container_dma_map(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
- void *vaddr, bool readonly)
+ void *vaddr, bool readonly, MemoryRegion *mr)
{
VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
g_assert(vioc->dma_map);
- return vioc->dma_map(bcontainer, iova, size, vaddr, readonly);
+ return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr);
}
int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index a9f0dba..3e8d645 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -31,10 +31,11 @@
#include "system/reset.h"
#include "trace.h"
#include "qapi/error.h"
+#include "migration/cpr.h"
+#include "migration/blocker.h"
#include "pci.h"
#include "hw/vfio/vfio-container.h"
#include "vfio-helpers.h"
-#include "vfio-cpr.h"
#include "vfio-listener.h"
#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio"
@@ -135,6 +136,8 @@ static int vfio_legacy_dma_unmap_one(const VFIOContainerBase *bcontainer,
int ret;
Error *local_err = NULL;
+ g_assert(!cpr_is_incoming());
+
if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) {
if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) &&
bcontainer->dirty_pages_supported) {
@@ -207,7 +210,8 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
}
static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
- ram_addr_t size, void *vaddr, bool readonly)
+ ram_addr_t size, void *vaddr, bool readonly,
+ MemoryRegion *mr)
{
const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
@@ -425,7 +429,12 @@ static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
return NULL;
}
- if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
+ /*
+ * During CPR, just set the container type and skip the ioctls, as the
+ * container and group are already configured in the kernel.
+ */
+ if (!cpr_is_incoming() &&
+ !vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
return NULL;
}
@@ -592,6 +601,11 @@ static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup *group,
group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
vfio_group_add_kvm_device(group);
+ /*
+ * Remember the container fd for each group, so we can attach to the same
+ * container after CPR.
+ */
+ cpr_resave_fd("vfio_container_for_group", group->groupid, container->fd);
return true;
}
@@ -601,6 +615,7 @@ static void vfio_container_group_del(VFIOContainer *container, VFIOGroup *group)
group->container = NULL;
vfio_group_del_kvm_device(group);
vfio_ram_block_discard_disable(container, false);
+ cpr_delete_fd("vfio_container_for_group", group->groupid);
}
static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as,
@@ -615,17 +630,34 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as,
bool group_was_added = false;
space = vfio_address_space_get(as);
+ fd = cpr_find_fd("vfio_container_for_group", group->groupid);
- QLIST_FOREACH(bcontainer, &space->containers, next) {
- container = container_of(bcontainer, VFIOContainer, bcontainer);
- if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
- return vfio_container_group_add(container, group, errp);
+ if (!cpr_is_incoming()) {
+ QLIST_FOREACH(bcontainer, &space->containers, next) {
+ container = container_of(bcontainer, VFIOContainer, bcontainer);
+ if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
+ return vfio_container_group_add(container, group, errp);
+ }
}
- }
- fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
- if (fd < 0) {
- goto fail;
+ fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
+ if (fd < 0) {
+ goto fail;
+ }
+ } else {
+ /*
+ * For incoming CPR, the group is already attached in the kernel.
+ * If a container with matching fd is found, then update the
+ * userland group list and return. If not, then after the loop,
+ * create the container struct and group list.
+ */
+ QLIST_FOREACH(bcontainer, &space->containers, next) {
+ container = container_of(bcontainer, VFIOContainer, bcontainer);
+
+ if (vfio_cpr_container_match(container, group, fd)) {
+ return vfio_container_group_add(container, group, errp);
+ }
+ }
}
ret = ioctl(fd, VFIO_GET_API_VERSION);
@@ -642,7 +674,7 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as,
new_container = true;
bcontainer = &container->bcontainer;
- if (!vfio_cpr_register_container(bcontainer, errp)) {
+ if (!vfio_legacy_cpr_register_container(container, errp)) {
goto fail;
}
@@ -660,8 +692,17 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as,
}
group_was_added = true;
- if (!vfio_listener_register(bcontainer, errp)) {
- goto fail;
+ /*
+ * If CPR, register the listener later, after all state that may
+ * affect regions and mapping boundaries has been cpr load'ed. Later,
+ * the listener will invoke its callback on each flat section and call
+ * dma_map to supply the new vaddr, and the calls will match the mappings
+ * remembered by the kernel.
+ */
+ if (!cpr_is_incoming()) {
+ if (!vfio_listener_register(bcontainer, errp)) {
+ goto fail;
+ }
}
bcontainer->initialized = true;
@@ -678,7 +719,7 @@ fail:
vioc->release(bcontainer);
}
if (new_container) {
- vfio_cpr_unregister_container(bcontainer);
+ vfio_legacy_cpr_unregister_container(container);
object_unref(container);
}
if (fd >= 0) {
@@ -697,6 +738,7 @@ static void vfio_container_disconnect(VFIOGroup *group)
QLIST_REMOVE(group, container_next);
group->container = NULL;
+ cpr_delete_fd("vfio_container_for_group", group->groupid);
/*
* Explicitly release the listener first before unset container,
@@ -719,7 +761,7 @@ static void vfio_container_disconnect(VFIOGroup *group)
VFIOAddressSpace *space = bcontainer->space;
trace_vfio_container_disconnect(container->fd);
- vfio_cpr_unregister_container(bcontainer);
+ vfio_legacy_cpr_unregister_container(container);
close(container->fd);
object_unref(container);
@@ -750,7 +792,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp)
group = g_malloc0(sizeof(*group));
snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
- group->fd = qemu_open(path, O_RDWR, errp);
+ group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, errp);
if (group->fd < 0) {
goto free_group_exit;
}
@@ -782,6 +824,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp)
return group;
close_fd_exit:
+ cpr_delete_fd("vfio_group", groupid);
close(group->fd);
free_group_exit:
@@ -803,6 +846,7 @@ static void vfio_group_put(VFIOGroup *group)
vfio_container_disconnect(group);
QLIST_REMOVE(group, next);
trace_vfio_group_put(group->fd);
+ cpr_delete_fd("vfio_group", group->groupid);
close(group->fd);
g_free(group);
}
@@ -813,7 +857,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
g_autofree struct vfio_device_info *info = NULL;
int fd;
- fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+ fd = vfio_cpr_group_get_device_fd(group->fd, name);
if (fd < 0) {
error_setg_errno(errp, errno, "error getting device from group %d",
group->groupid);
@@ -826,8 +870,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
info = vfio_get_device_info(fd);
if (!info) {
error_setg_errno(errp, errno, "error getting device info");
- close(fd);
- return false;
+ goto fail;
}
/*
@@ -841,8 +884,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
if (!QLIST_EMPTY(&group->device_list)) {
error_setg(errp, "Inconsistent setting of support for discarding "
"RAM (e.g., balloon) within group");
- close(fd);
- return false;
+ goto fail;
}
if (!group->ram_block_discard_allowed) {
@@ -860,6 +902,11 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs);
return true;
+
+fail:
+ close(fd);
+ cpr_delete_fd(name, 0);
+ return false;
}
static void vfio_device_put(VFIODevice *vbasedev)
@@ -870,6 +917,7 @@ static void vfio_device_put(VFIODevice *vbasedev)
QLIST_REMOVE(vbasedev, next);
vbasedev->group = NULL;
trace_vfio_device_put(vbasedev->fd);
+ cpr_delete_fd(vbasedev->name, 0);
close(vbasedev->fd);
}
@@ -939,6 +987,13 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
goto device_put_exit;
}
+ if (vbasedev->mdev) {
+ error_setg(&vbasedev->cpr.mdev_blocker,
+ "CPR does not support vfio mdev %s", vbasedev->name);
+ migrate_add_blocker_modes(&vbasedev->cpr.mdev_blocker, &error_fatal,
+ MIG_MODE_CPR_TRANSFER, -1);
+ }
+
return true;
device_put_exit:
@@ -956,6 +1011,7 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev)
vfio_device_unprepare(vbasedev);
+ migrate_del_blocker(&vbasedev->cpr.mdev_blocker);
object_unref(vbasedev->hiod);
vfio_device_put(vbasedev);
vfio_group_put(group);
diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c
new file mode 100644
index 0000000..a84c324
--- /dev/null
+++ b/hw/vfio/cpr-legacy.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2021-2025 Oracle and/or its affiliates.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+#include "qemu/osdep.h"
+#include "hw/vfio/vfio-container.h"
+#include "hw/vfio/vfio-device.h"
+#include "hw/vfio/vfio-listener.h"
+#include "migration/blocker.h"
+#include "migration/cpr.h"
+#include "migration/migration.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+
+static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp)
+{
+ struct vfio_iommu_type1_dma_unmap unmap = {
+ .argsz = sizeof(unmap),
+ .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL,
+ .iova = 0,
+ .size = 0,
+ };
+ if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+ error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all");
+ return false;
+ }
+ container->cpr.vaddr_unmapped = true;
+ return true;
+}
+
+/*
+ * Set the new @vaddr for any mappings registered during cpr load.
+ * The incoming state is cleared thereafter.
+ */
+static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size, void *vaddr,
+ bool readonly, MemoryRegion *mr)
+{
+ const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
+ struct vfio_iommu_type1_dma_map map = {
+ .argsz = sizeof(map),
+ .flags = VFIO_DMA_MAP_FLAG_VADDR,
+ .vaddr = (__u64)(uintptr_t)vaddr,
+ .iova = iova,
+ .size = size,
+ };
+
+ g_assert(cpr_is_incoming());
+
+ if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) {
+ return -errno;
+ }
+
+ return 0;
+}
+
+static void vfio_region_remap(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ VFIOContainer *container = container_of(listener, VFIOContainer,
+ cpr.remap_listener);
+ vfio_container_region_add(&container->bcontainer, section, true);
+}
+
+static bool vfio_cpr_supported(VFIOContainer *container, Error **errp)
+{
+ if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) {
+ error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR");
+ return false;
+
+ } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) {
+ error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL");
+ return false;
+
+ } else {
+ return true;
+ }
+}
+
+static int vfio_container_pre_save(void *opaque)
+{
+ VFIOContainer *container = opaque;
+ Error *local_err = NULL;
+
+ if (!vfio_dma_unmap_vaddr_all(container, &local_err)) {
+ error_report_err(local_err);
+ return -1;
+ }
+ return 0;
+}
+
+static int vfio_container_post_load(void *opaque, int version_id)
+{
+ VFIOContainer *container = opaque;
+ VFIOContainerBase *bcontainer = &container->bcontainer;
+ VFIOGroup *group;
+ Error *local_err = NULL;
+
+ if (!vfio_listener_register(bcontainer, &local_err)) {
+ error_report_err(local_err);
+ return -1;
+ }
+
+ QLIST_FOREACH(group, &container->group_list, container_next) {
+ VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
+
+ /* Restore original dma_map function */
+ vioc->dma_map = container->cpr.saved_dma_map;
+ }
+ return 0;
+}
+
+static const VMStateDescription vfio_container_vmstate = {
+ .name = "vfio-container",
+ .version_id = 0,
+ .minimum_version_id = 0,
+ .priority = MIG_PRI_LOW, /* Must happen after devices and groups */
+ .pre_save = vfio_container_pre_save,
+ .post_load = vfio_container_post_load,
+ .needed = cpr_incoming_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier,
+ MigrationEvent *e, Error **errp)
+{
+ VFIOContainer *container =
+ container_of(notifier, VFIOContainer, cpr.transfer_notifier);
+ VFIOContainerBase *bcontainer = &container->bcontainer;
+
+ if (e->type != MIG_EVENT_PRECOPY_FAILED) {
+ return 0;
+ }
+
+ if (container->cpr.vaddr_unmapped) {
+ /*
+ * Force a call to vfio_region_remap for each mapped section by
+ * temporarily registering a listener, and temporarily diverting
+ * dma_map to vfio_legacy_cpr_dma_map. The latter restores vaddr.
+ */
+
+ VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
+ vioc->dma_map = vfio_legacy_cpr_dma_map;
+
+ container->cpr.remap_listener = (MemoryListener) {
+ .name = "vfio cpr recover",
+ .region_add = vfio_region_remap
+ };
+ memory_listener_register(&container->cpr.remap_listener,
+ bcontainer->space->as);
+ memory_listener_unregister(&container->cpr.remap_listener);
+ container->cpr.vaddr_unmapped = false;
+ vioc->dma_map = container->cpr.saved_dma_map;
+ }
+ return 0;
+}
+
+bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp)
+{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
+ Error **cpr_blocker = &container->cpr.blocker;
+
+ migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier,
+ vfio_cpr_reboot_notifier,
+ MIG_MODE_CPR_REBOOT);
+
+ if (!vfio_cpr_supported(container, cpr_blocker)) {
+ return migrate_add_blocker_modes(cpr_blocker, errp,
+ MIG_MODE_CPR_TRANSFER, -1) == 0;
+ }
+
+ vmstate_register(NULL, -1, &vfio_container_vmstate, container);
+
+ /* During incoming CPR, divert calls to dma_map. */
+ if (cpr_is_incoming()) {
+ VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
+ container->cpr.saved_dma_map = vioc->dma_map;
+ vioc->dma_map = vfio_legacy_cpr_dma_map;
+ }
+
+ migration_add_notifier_mode(&container->cpr.transfer_notifier,
+ vfio_cpr_fail_notifier,
+ MIG_MODE_CPR_TRANSFER);
+ return true;
+}
+
+void vfio_legacy_cpr_unregister_container(VFIOContainer *container)
+{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
+
+ migration_remove_notifier(&bcontainer->cpr_reboot_notifier);
+ migrate_del_blocker(&container->cpr.blocker);
+ vmstate_unregister(NULL, &vfio_container_vmstate, container);
+ migration_remove_notifier(&container->cpr.transfer_notifier);
+}
+
+/*
+ * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after
+ * succeeding for others, so the latter have lost their vaddr. Call this
+ * to restore vaddr for a section with a giommu.
+ *
+ * The giommu already exists. Find it and replay it, which calls
+ * vfio_legacy_cpr_dma_map further down the stack.
+ */
+void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section)
+{
+ VFIOGuestIOMMU *giommu = NULL;
+ hwaddr as_offset = section->offset_within_address_space;
+ hwaddr iommu_offset = as_offset - section->offset_within_region;
+
+ QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
+ if (giommu->iommu_mr == IOMMU_MEMORY_REGION(section->mr) &&
+ giommu->iommu_offset == iommu_offset) {
+ break;
+ }
+ }
+ g_assert(giommu);
+ memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
+}
+
+/*
+ * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after
+ * succeeding for others, so the latter have lost their vaddr. Call this
+ * to restore vaddr for a section with a RamDiscardManager.
+ *
+ * The ram discard listener already exists. Call its populate function
+ * directly, which calls vfio_legacy_cpr_dma_map.
+ */
+bool vfio_cpr_ram_discard_register_listener(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section)
+{
+ VFIORamDiscardListener *vrdl =
+ vfio_find_ram_discard_listener(bcontainer, section);
+
+ g_assert(vrdl);
+ return vrdl->listener.notify_populate(&vrdl->listener, section) == 0;
+}
+
+int vfio_cpr_group_get_device_fd(int d, const char *name)
+{
+ const int id = 0;
+ int fd = cpr_find_fd(name, id);
+
+ if (fd < 0) {
+ fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name);
+ if (fd >= 0) {
+ cpr_save_fd(name, id, fd);
+ }
+ }
+ return fd;
+}
+
+static bool same_device(int fd1, int fd2)
+{
+ struct stat st1, st2;
+
+ return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev;
+}
+
+bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group,
+ int fd)
+{
+ if (container->fd == fd) {
+ return true;
+ }
+ if (!same_device(container->fd, fd)) {
+ return false;
+ }
+ /*
+ * Same device, different fd. This occurs when the container fd is
+ * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS
+ * produces duplicates. De-dup it.
+ */
+ cpr_delete_fd("vfio_container_for_group", group->groupid);
+ close(fd);
+ cpr_save_fd("vfio_container_for_group", group->groupid, container->fd);
+ return true;
+}
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index 3214184..fdbb58e 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -7,13 +7,14 @@
#include "qemu/osdep.h"
#include "hw/vfio/vfio-device.h"
-#include "migration/misc.h"
+#include "hw/vfio/vfio-cpr.h"
+#include "hw/vfio/pci.h"
+#include "migration/cpr.h"
#include "qapi/error.h"
#include "system/runstate.h"
-#include "vfio-cpr.h"
-static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier,
- MigrationEvent *e, Error **errp)
+int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier,
+ MigrationEvent *e, Error **errp)
{
if (e->type == MIG_EVENT_PRECOPY_SETUP &&
!runstate_check(RUN_STATE_SUSPENDED) && !vm_get_suspended()) {
@@ -38,3 +39,32 @@ void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer)
{
migration_remove_notifier(&bcontainer->cpr_reboot_notifier);
}
+
+/*
+ * The kernel may change non-emulated config bits. Exclude them from the
+ * changed-bits check in get_pci_config_device.
+ */
+static int vfio_cpr_pci_pre_load(void *opaque)
+{
+ VFIOPCIDevice *vdev = opaque;
+ PCIDevice *pdev = &vdev->pdev;
+ int size = MIN(pci_config_size(pdev), vdev->config_size);
+ int i;
+
+ for (i = 0; i < size; i++) {
+ pdev->cmask[i] &= vdev->emulated_config_bits[i];
+ }
+
+ return 0;
+}
+
+const VMStateDescription vfio_cpr_pci_vmstate = {
+ .name = "vfio-cpr-pci",
+ .version_id = 0,
+ .minimum_version_id = 0,
+ .pre_load = vfio_cpr_pci_pre_load,
+ .needed = cpr_incoming_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_END_OF_LIST()
+ }
+};
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index 9fba2c7..d32600e 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -200,6 +200,7 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
struct vfio_region_info **info)
{
size_t argsz = sizeof(struct vfio_region_info);
+ int fd = -1;
int ret;
/* check cache */
@@ -214,7 +215,7 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
retry:
(*info)->argsz = argsz;
- ret = vbasedev->io_ops->get_region_info(vbasedev, *info);
+ ret = vbasedev->io_ops->get_region_info(vbasedev, *info, &fd);
if (ret != 0) {
g_free(*info);
*info = NULL;
@@ -225,11 +226,19 @@ retry:
argsz = (*info)->argsz;
*info = g_realloc(*info, argsz);
+ if (fd != -1) {
+ close(fd);
+ fd = -1;
+ }
+
goto retry;
}
/* fill cache */
vbasedev->reginfo[index] = *info;
+ if (vbasedev->region_fds != NULL) {
+ vbasedev->region_fds[index] = fd;
+ }
return 0;
}
@@ -334,6 +343,7 @@ void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
vbasedev->io_ops = &vfio_device_io_ops_ioctl;
vbasedev->dev = dev;
vbasedev->fd = -1;
+ vbasedev->use_region_fds = false;
vbasedev->ram_block_discard_allowed = ram_discard;
}
@@ -444,6 +454,9 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
vbasedev->reginfo = g_new0(struct vfio_region_info *,
vbasedev->num_regions);
+ if (vbasedev->use_region_fds) {
+ vbasedev->region_fds = g_new0(int, vbasedev->num_regions);
+ }
}
void vfio_device_unprepare(VFIODevice *vbasedev)
@@ -452,9 +465,14 @@ void vfio_device_unprepare(VFIODevice *vbasedev)
for (i = 0; i < vbasedev->num_regions; i++) {
g_free(vbasedev->reginfo[i]);
+ if (vbasedev->region_fds != NULL && vbasedev->region_fds[i] != -1) {
+ close(vbasedev->region_fds[i]);
+ }
+
}
- g_free(vbasedev->reginfo);
- vbasedev->reginfo = NULL;
+
+ g_clear_pointer(&vbasedev->reginfo, g_free);
+ g_clear_pointer(&vbasedev->region_fds, g_free);
QLIST_REMOVE(vbasedev, container_next);
QLIST_REMOVE(vbasedev, global_next);
@@ -476,10 +494,13 @@ static int vfio_device_io_device_feature(VFIODevice *vbasedev,
}
static int vfio_device_io_get_region_info(VFIODevice *vbasedev,
- struct vfio_region_info *info)
+ struct vfio_region_info *info,
+ int *fd)
{
int ret;
+ *fd = -1;
+
ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
return ret < 0 ? -errno : ret;
@@ -522,7 +543,8 @@ static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index,
}
static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index,
- off_t off, uint32_t size, void *data)
+ off_t off, uint32_t size, void *data,
+ bool post)
{
struct vfio_region_info *info;
int ret;
diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c
index e7952d1..e7a9d1f 100644
--- a/hw/vfio/igd.c
+++ b/hw/vfio/igd.c
@@ -187,23 +187,21 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
}
static bool vfio_pci_igd_opregion_detect(VFIOPCIDevice *vdev,
- struct vfio_region_info **opregion,
- Error **errp)
+ struct vfio_region_info **opregion)
{
int ret;
- /* Hotplugging is not supported for opregion access */
- if (vdev->pdev.qdev.hotplugged) {
- error_setg(errp, "IGD OpRegion is not supported on hotplugged device");
- return false;
- }
-
ret = vfio_device_get_region_info_type(&vdev->vbasedev,
VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, opregion);
if (ret) {
- error_setg_errno(errp, -ret,
- "Device does not supports IGD OpRegion feature");
+ return false;
+ }
+
+ /* Hotplugging is not supported for opregion access */
+ if (vdev->pdev.qdev.hotplugged) {
+ warn_report("IGD device detected, but OpRegion is not supported "
+ "on hotplugged device.");
return false;
}
@@ -524,7 +522,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp)
}
/* IGD device always comes with OpRegion */
- if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) {
+ if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) {
return true;
}
info_report("OpRegion detected on Intel display %x.", vdev->device_id);
@@ -695,7 +693,7 @@ static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp)
return true;
}
- if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) {
+ if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) {
/* Should never reach here, KVMGT always emulates OpRegion */
return false;
}
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index af1c7ab..d3efef7 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -21,20 +21,21 @@
#include "qapi/error.h"
#include "system/iommufd.h"
#include "hw/qdev-core.h"
+#include "hw/vfio/vfio-cpr.h"
#include "system/reset.h"
#include "qemu/cutils.h"
#include "qemu/chardev_open.h"
#include "pci.h"
#include "vfio-iommufd.h"
#include "vfio-helpers.h"
-#include "vfio-cpr.h"
#include "vfio-listener.h"
#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \
TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio"
static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
- ram_addr_t size, void *vaddr, bool readonly)
+ ram_addr_t size, void *vaddr, bool readonly,
+ MemoryRegion *mr)
{
const VFIOIOMMUFDContainer *container =
container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
@@ -592,6 +593,10 @@ found_container:
goto err_listener_register;
}
+ /*
+ * Do not move this code before attachment! The nested IOMMU support
+ * needs device and hwpt id which are generated only after attachment.
+ */
if (!vfio_device_hiod_create_and_realize(vbasedev,
TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, errp)) {
goto err_listener_register;
@@ -810,21 +815,38 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, const void *data)
vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap;
};
+static bool
+host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+ uint32_t hwpt_id, Error **errp)
+{
+ VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent;
+
+ return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp);
+}
+
+static bool
+host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+ Error **errp)
+{
+ VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent;
+
+ return iommufd_cdev_detach_ioas_hwpt(vbasedev, errp);
+}
+
static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
Error **errp)
{
VFIODevice *vdev = opaque;
+ HostIOMMUDeviceIOMMUFD *idev;
HostIOMMUDeviceCaps *caps = &hiod->caps;
+ VendorCaps *vendor_caps = &caps->vendor_caps;
enum iommu_hw_info_type type;
- union {
- struct iommu_hw_info_vtd vtd;
- } data;
uint64_t hw_caps;
hiod->agent = opaque;
- if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid,
- &type, &data, sizeof(data),
+ if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type,
+ vendor_caps, sizeof(*vendor_caps),
&hw_caps, errp)) {
return false;
}
@@ -833,6 +855,11 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
caps->type = type;
caps->hw_caps = hw_caps;
+ idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod);
+ idev->iommufd = vdev->iommufd;
+ idev->devid = vdev->devid;
+ idev->hwpt_id = vdev->hwpt->hwpt_id;
+
return true;
}
@@ -858,10 +885,14 @@ hiod_iommufd_vfio_get_page_size_mask(HostIOMMUDevice *hiod)
static void hiod_iommufd_vfio_class_init(ObjectClass *oc, const void *data)
{
HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc);
+ HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc);
hiodc->realize = hiod_iommufd_vfio_realize;
hiodc->get_iova_ranges = hiod_iommufd_vfio_get_iova_ranges;
hiodc->get_page_size_mask = hiod_iommufd_vfio_get_page_size_mask;
+
+ idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt;
+ idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt;
};
static const TypeInfo types[] = {
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index bfacb3d..f498e23 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -90,16 +90,17 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
section->offset_within_address_space & (1ULL << 63);
}
-/* Called with rcu_read_lock held. */
-static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
- ram_addr_t *ram_addr, bool *read_only,
- Error **errp)
+/*
+ * Called with rcu_read_lock held.
+ * The returned MemoryRegion must not be accessed after calling rcu_read_unlock.
+ */
+static MemoryRegion *vfio_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p,
+ Error **errp)
{
- bool ret, mr_has_discard_manager;
+ MemoryRegion *mr;
- ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
- &mr_has_discard_manager, errp);
- if (ret && mr_has_discard_manager) {
+ mr = memory_translate_iotlb(iotlb, xlat_p, errp);
+ if (mr && memory_region_has_ram_discard_manager(mr)) {
/*
* Malicious VMs might trigger discarding of IOMMU-mapped memory. The
* pages will remain pinned inside vfio until unmapped, resulting in a
@@ -118,7 +119,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
" intended via an IOMMU. It's possible to mitigate "
" by setting/adjusting RLIMIT_MEMLOCK.");
}
- return ret;
+ return mr;
}
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
@@ -126,6 +127,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
VFIOContainerBase *bcontainer = giommu->bcontainer;
hwaddr iova = iotlb->iova + giommu->iommu_offset;
+ MemoryRegion *mr;
+ hwaddr xlat;
void *vaddr;
int ret;
Error *local_err = NULL;
@@ -150,10 +153,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
bool read_only;
- if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
+ mr = vfio_translate_iotlb(iotlb, &xlat, &local_err);
+ if (!mr) {
error_report_err(local_err);
goto out;
}
+ vaddr = memory_region_get_ram_ptr(mr) + xlat;
+ read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly;
+
/*
* vaddr is only valid until rcu_read_unlock(). But after
* vfio_dma_map has set up the mapping the pages will be
@@ -163,7 +170,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
*/
ret = vfio_container_dma_map(bcontainer, iova,
iotlb->addr_mask + 1, vaddr,
- read_only);
+ read_only, mr);
if (ret) {
error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%s)",
@@ -233,7 +240,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
vaddr = memory_region_get_ram_ptr(section->mr) + start;
ret = vfio_container_dma_map(bcontainer, iova, next - start,
- vaddr, section->readonly);
+ vaddr, section->readonly, section->mr);
if (ret) {
/* Rollback */
vfio_ram_discard_notify_discard(rdl, section);
@@ -430,7 +437,7 @@ static void vfio_listener_commit(MemoryListener *listener)
listener);
void (*listener_commit)(VFIOContainerBase *bcontainer);
- listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin;
+ listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit;
if (listener_commit) {
listener_commit(bcontainer);
@@ -449,11 +456,38 @@ static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp)
}
}
+VFIORamDiscardListener *vfio_find_ram_discard_listener(
+ VFIOContainerBase *bcontainer, MemoryRegionSection *section)
+{
+ VFIORamDiscardListener *vrdl = NULL;
+
+ QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
+ if (vrdl->mr == section->mr &&
+ vrdl->offset_within_address_space ==
+ section->offset_within_address_space) {
+ break;
+ }
+ }
+
+ if (!vrdl) {
+ hw_error("vfio: Trying to sync missing RAM discard listener");
+ /* does not return */
+ }
+ return vrdl;
+}
+
static void vfio_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
listener);
+ vfio_container_region_add(bcontainer, section, false);
+}
+
+void vfio_container_region_add(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section,
+ bool cpr_remap)
+{
hwaddr iova, end;
Int128 llend, llsize;
void *vaddr;
@@ -489,6 +523,11 @@ static void vfio_listener_region_add(MemoryListener *listener,
int iommu_idx;
trace_vfio_listener_region_add_iommu(section->mr->name, iova, end);
+
+ if (cpr_remap) {
+ vfio_cpr_giommu_remap(bcontainer, section);
+ }
+
/*
* FIXME: For VFIO iommu types which have KVM acceleration to
* avoid bouncing all map/unmaps through qemu this way, this
@@ -531,7 +570,12 @@ static void vfio_listener_region_add(MemoryListener *listener,
* about changes.
*/
if (memory_region_has_ram_discard_manager(section->mr)) {
- vfio_ram_discard_register_listener(bcontainer, section);
+ if (!cpr_remap) {
+ vfio_ram_discard_register_listener(bcontainer, section);
+ } else if (!vfio_cpr_ram_discard_register_listener(bcontainer,
+ section)) {
+ goto fail;
+ }
return;
}
@@ -557,7 +601,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
}
ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
- vaddr, section->readonly);
+ vaddr, section->readonly, section->mr);
if (ret) {
error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%s)",
@@ -1010,6 +1054,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
ram_addr_t translated_addr;
Error *local_err = NULL;
int ret = -EINVAL;
+ MemoryRegion *mr;
+ hwaddr xlat;
trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
@@ -1021,9 +1067,11 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
}
rcu_read_lock();
- if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
+ mr = vfio_translate_iotlb(iotlb, &xlat, &local_err);
+ if (!mr) {
goto out_unlock;
}
+ translated_addr = memory_region_get_ram_addr(mr) + xlat;
ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
translated_addr, &local_err);
@@ -1075,19 +1123,8 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
- VFIORamDiscardListener *vrdl = NULL;
-
- QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
- if (vrdl->mr == section->mr &&
- vrdl->offset_within_address_space ==
- section->offset_within_address_space) {
- break;
- }
- }
-
- if (!vrdl) {
- hw_error("vfio: Trying to sync missing RAM discard listener");
- }
+ VFIORamDiscardListener *vrdl =
+ vfio_find_ram_discard_listener(bcontainer, section);
/*
* We only want/can synchronize the bitmap for actually mapped parts -
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index bccb050..73d29f9 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -21,6 +21,7 @@ system_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c'))
system_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c'))
system_ss.add(when: 'CONFIG_VFIO', if_true: files(
'cpr.c',
+ 'cpr-legacy.c',
'device.c',
'migration.c',
'migration-multifd.c',
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a1bfdfe..fa25bde 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -30,6 +30,7 @@
#include "hw/qdev-properties.h"
#include "hw/qdev-properties-system.h"
#include "migration/vmstate.h"
+#include "migration/cpr.h"
#include "qobject/qdict.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
@@ -56,6 +57,23 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
+static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e,
+ const char *name, int nr, Error **errp)
+{
+ int ret = event_notifier_init(e, 0);
+
+ if (ret) {
+ error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name);
+ }
+ return !ret;
+}
+
+static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e,
+ const char *name, int nr)
+{
+ event_notifier_cleanup(e);
+}
+
/*
* Disabling BAR mmaping can be slow, but toggling it around INTx can
* also be a huge overhead. We try to get the best of both worlds by
@@ -103,7 +121,7 @@ static void vfio_intx_interrupt(void *opaque)
}
}
-static void vfio_intx_eoi(VFIODevice *vbasedev)
+void vfio_pci_intx_eoi(VFIODevice *vbasedev)
{
VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
@@ -111,7 +129,7 @@ static void vfio_intx_eoi(VFIODevice *vbasedev)
return;
}
- trace_vfio_intx_eoi(vbasedev->name);
+ trace_vfio_pci_intx_eoi(vbasedev->name);
vdev->intx.pending = false;
pci_irq_deassert(&vdev->pdev);
@@ -136,8 +154,7 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
pci_irq_deassert(&vdev->pdev);
/* Get an eventfd for resample/unmask */
- if (event_notifier_init(&vdev->intx.unmask, 0)) {
- error_setg(errp, "event_notifier_init failed eoi");
+ if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) {
goto fail;
}
@@ -169,7 +186,7 @@ fail_vfio:
kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
vdev->intx.route.irq);
fail_irqfd:
- event_notifier_cleanup(&vdev->intx.unmask);
+ vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
fail:
qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
@@ -201,7 +218,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
}
/* We only need to close the eventfd for VFIO to cleanup the kernel side */
- event_notifier_cleanup(&vdev->intx.unmask);
+ vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
/* QEMU starts listening for interrupt events. */
qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
@@ -236,7 +253,7 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
}
/* Re-enable the interrupt in cased we missed an EOI */
- vfio_intx_eoi(&vdev->vbasedev);
+ vfio_pci_intx_eoi(&vdev->vbasedev);
}
static void vfio_intx_routing_notifier(PCIDevice *pdev)
@@ -268,7 +285,6 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
Error *err = NULL;
int32_t fd;
- int ret;
if (!pin) {
@@ -291,9 +307,8 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
}
#endif
- ret = event_notifier_init(&vdev->intx.interrupt, 0);
- if (ret) {
- error_setg_errno(errp, -ret, "event_notifier_init failed");
+ if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0,
+ errp)) {
return false;
}
fd = event_notifier_get_fd(&vdev->intx.interrupt);
@@ -302,7 +317,7 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
qemu_set_fd_handler(fd, NULL, NULL, vdev);
- event_notifier_cleanup(&vdev->intx.interrupt);
+ vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0);
return false;
}
@@ -329,13 +344,18 @@ static void vfio_intx_disable(VFIOPCIDevice *vdev)
fd = event_notifier_get_fd(&vdev->intx.interrupt);
qemu_set_fd_handler(fd, NULL, NULL, vdev);
- event_notifier_cleanup(&vdev->intx.interrupt);
+ vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0);
vdev->interrupt = VFIO_INT_NONE;
trace_vfio_intx_disable(vdev->vbasedev.name);
}
+bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp)
+{
+ return vfio_intx_enable(vdev, errp);
+}
+
/*
* MSI/X
*/
@@ -460,8 +480,8 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
return ret;
}
-static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
- int vector_n, bool msix)
+void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
+ int vector_n, bool msix)
{
if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
return;
@@ -471,13 +491,16 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
vector_n, &vdev->pdev);
}
-static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector)
+static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr)
{
+ const char *name = "kvm_interrupt";
+
if (vector->virq < 0) {
return;
}
- if (event_notifier_init(&vector->kvm_interrupt, 0)) {
+ if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr,
+ NULL)) {
goto fail_notifier;
}
@@ -489,19 +512,20 @@ static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector)
return;
fail_kvm:
- event_notifier_cleanup(&vector->kvm_interrupt);
+ vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr);
fail_notifier:
kvm_irqchip_release_virq(kvm_state, vector->virq);
vector->virq = -1;
}
-static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
+static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
+ int nr)
{
kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
vector->virq);
kvm_irqchip_release_virq(kvm_state, vector->virq);
vector->virq = -1;
- event_notifier_cleanup(&vector->kvm_interrupt);
+ vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr);
}
static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
@@ -511,6 +535,43 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
kvm_irqchip_commit_routes(kvm_state);
}
+static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector,
+ unsigned int nr)
+{
+ Error *err = NULL;
+ int32_t fd;
+
+ if (vector->virq >= 0) {
+ fd = event_notifier_get_fd(&vector->kvm_interrupt);
+ } else {
+ fd = event_notifier_get_fd(&vector->interrupt);
+ }
+
+ if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
+ VFIO_IRQ_SET_ACTION_TRIGGER,
+ fd, &err)) {
+ error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name);
+ }
+}
+
+void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr)
+{
+ VFIOMSIVector *vector = &vdev->msi_vectors[nr];
+ PCIDevice *pdev = &vdev->pdev;
+ Error *local_err = NULL;
+
+ vector->vdev = vdev;
+ vector->virq = -1;
+ if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr,
+ &local_err)) {
+ error_report_err(local_err);
+ }
+ vector->use = true;
+ if (vdev->interrupt == VFIO_INT_MSIX) {
+ msix_vector_use(pdev, nr);
+ }
+}
+
static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
MSIMessage *msg, IOHandler *handler)
{
@@ -524,13 +585,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
vector = &vdev->msi_vectors[nr];
if (!vector->use) {
- vector->vdev = vdev;
- vector->virq = -1;
- if (event_notifier_init(&vector->interrupt, 0)) {
- error_report("vfio: Error: event_notifier_init failed");
- }
- vector->use = true;
- msix_vector_use(pdev, nr);
+ vfio_pci_vector_init(vdev, nr);
}
qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
@@ -542,19 +597,19 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
*/
if (vector->virq >= 0) {
if (!msg) {
- vfio_remove_kvm_msi_virq(vector);
+ vfio_remove_kvm_msi_virq(vdev, vector, nr);
} else {
vfio_update_kvm_msi_virq(vector, *msg, pdev);
}
} else {
if (msg) {
if (vdev->defer_kvm_irq_routing) {
- vfio_add_kvm_msi_virq(vdev, vector, nr, true);
+ vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true);
} else {
vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
- vfio_add_kvm_msi_virq(vdev, vector, nr, true);
+ vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true);
kvm_irqchip_commit_route_changes(&vfio_route_change);
- vfio_connect_kvm_msi_virq(vector);
+ vfio_connect_kvm_msi_virq(vector, nr);
}
}
}
@@ -583,21 +638,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
strerror(-ret));
}
} else {
- Error *err = NULL;
- int32_t fd;
-
- if (vector->virq >= 0) {
- fd = event_notifier_get_fd(&vector->kvm_interrupt);
- } else {
- fd = event_notifier_get_fd(&vector->interrupt);
- }
-
- if (!vfio_device_irq_set_signaling(&vdev->vbasedev,
- VFIO_PCI_MSIX_IRQ_INDEX, nr,
- VFIO_IRQ_SET_ACTION_TRIGGER, fd,
- &err)) {
- error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
- }
+ set_irq_signalling(&vdev->vbasedev, vector, nr);
}
}
@@ -645,14 +686,14 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
}
}
-static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
+void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
{
assert(!vdev->defer_kvm_irq_routing);
vdev->defer_kvm_irq_routing = true;
vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
}
-static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
+void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
{
int i;
@@ -662,7 +703,7 @@ static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
kvm_irqchip_commit_route_changes(&vfio_route_change);
for (i = 0; i < vdev->nr_vectors; i++) {
- vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i]);
+ vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i], i);
}
}
@@ -682,14 +723,14 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
* routes once rather than per vector provides a substantial
* performance improvement.
*/
- vfio_prepare_kvm_msi_virq_batch(vdev);
+ vfio_pci_prepare_kvm_msi_virq_batch(vdev);
if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
vfio_msix_vector_release, NULL)) {
error_report("vfio: msix_set_vector_notifiers failed");
}
- vfio_commit_kvm_msi_virq_batch(vdev);
+ vfio_pci_commit_kvm_msi_virq_batch(vdev);
if (vdev->nr_vectors) {
ret = vfio_enable_vectors(vdev, true);
@@ -733,19 +774,21 @@ retry:
* Deferring to commit the KVM routes once rather than per vector
* provides a substantial performance improvement.
*/
- vfio_prepare_kvm_msi_virq_batch(vdev);
+ vfio_pci_prepare_kvm_msi_virq_batch(vdev);
vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
for (i = 0; i < vdev->nr_vectors; i++) {
VFIOMSIVector *vector = &vdev->msi_vectors[i];
+ Error *local_err = NULL;
vector->vdev = vdev;
vector->virq = -1;
vector->use = true;
- if (event_notifier_init(&vector->interrupt, 0)) {
- error_report("vfio: Error: event_notifier_init failed");
+ if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i,
+ &local_err)) {
+ error_report_err(local_err);
}
qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
@@ -755,10 +798,10 @@ retry:
* Attempt to enable route through KVM irqchip,
* default to userspace handling if unavailable.
*/
- vfio_add_kvm_msi_virq(vdev, vector, i, false);
+ vfio_pci_add_kvm_msi_virq(vdev, vector, i, false);
}
- vfio_commit_kvm_msi_virq_batch(vdev);
+ vfio_pci_commit_kvm_msi_virq_batch(vdev);
/* Set interrupt type prior to possible interrupts */
vdev->interrupt = VFIO_INT_MSI;
@@ -801,11 +844,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
VFIOMSIVector *vector = &vdev->msi_vectors[i];
if (vdev->msi_vectors[i].use) {
if (vector->virq >= 0) {
- vfio_remove_kvm_msi_virq(vector);
+ vfio_remove_kvm_msi_virq(vdev, vector, i);
}
qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
NULL, NULL, NULL);
- event_notifier_cleanup(&vector->interrupt);
+ vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i);
}
}
@@ -984,7 +1027,7 @@ static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
{
return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev,
VFIO_PCI_CONFIG_REGION_INDEX,
- offset, size, data);
+ offset, size, data, false);
}
static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
@@ -1738,7 +1781,7 @@ static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
return true;
}
-static void vfio_teardown_msi(VFIOPCIDevice *vdev)
+void vfio_pci_teardown_msi(VFIOPCIDevice *vdev)
{
msi_uninit(&vdev->pdev);
@@ -1788,6 +1831,9 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
~PCI_BASE_ADDRESS_MEM_MASK);
bar->size = bar->region.size;
+
+ /* IO regions are sync, memory can be async */
+ bar->region.post_wr = (bar->ioport == 0);
}
static void vfio_bars_prepare(VFIOPCIDevice *vdev)
@@ -1834,7 +1880,7 @@ static void vfio_bars_register(VFIOPCIDevice *vdev)
}
}
-static void vfio_bars_exit(VFIOPCIDevice *vdev)
+void vfio_pci_bars_exit(VFIOPCIDevice *vdev)
{
int i;
@@ -2425,7 +2471,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
g_free(config);
}
-static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
+bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
{
PCIDevice *pdev = &vdev->pdev;
@@ -2701,7 +2747,7 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
static VFIODeviceOps vfio_pci_ops = {
.vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
.vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
- .vfio_eoi = vfio_intx_eoi,
+ .vfio_eoi = vfio_pci_intx_eoi,
.vfio_get_object = vfio_pci_get_object,
.vfio_save_config = vfio_pci_save_config,
.vfio_load_config = vfio_pci_load_config,
@@ -2772,7 +2818,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
return true;
}
-static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
+bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp)
{
VFIODevice *vbasedev = &vdev->vbasedev;
struct vfio_region_info *reg_info = NULL;
@@ -2818,7 +2864,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
return false;
}
- trace_vfio_populate_device_config(vdev->vbasedev.name,
+ trace_vfio_pci_populate_device_config(vdev->vbasedev.name,
(unsigned long)reg_info->size,
(unsigned long)reg_info->offset,
(unsigned long)reg_info->flags);
@@ -2840,7 +2886,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
if (ret) {
/* This can fail for an old kernel or legacy PCI dev */
- trace_vfio_populate_device_get_irq_info_failure(strerror(-ret));
+ trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret));
} else if (irq_info.count == 1) {
vdev->pci_aer = true;
} else {
@@ -2852,8 +2898,20 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
return true;
}
-static void vfio_pci_put_device(VFIOPCIDevice *vdev)
+void vfio_pci_put_device(VFIOPCIDevice *vdev)
{
+ vfio_display_finalize(vdev);
+ vfio_bars_finalize(vdev);
+ g_free(vdev->emulated_config_bits);
+ g_free(vdev->rom);
+ /*
+ * XXX Leaking igd_opregion is not an oversight, we can't remove the
+ * fw_cfg entry therefore leaking this allocation seems like the safest
+ * option.
+ *
+ * g_free(vdev->igd_opregion);
+ */
+
vfio_device_detach(&vdev->vbasedev);
g_free(vdev->vbasedev.name);
@@ -2888,7 +2946,7 @@ static void vfio_err_notifier_handler(void *opaque)
* and continue after disabling error recovery support for the
* device.
*/
-static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
+void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev)
{
Error *err = NULL;
int32_t fd;
@@ -2897,8 +2955,9 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
return;
}
- if (event_notifier_init(&vdev->err_notifier, 0)) {
- error_report("vfio: Unable to init event notifier for error detection");
+ if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0,
+ &err)) {
+ error_report_err(err);
vdev->pci_aer = false;
return;
}
@@ -2910,7 +2969,7 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
qemu_set_fd_handler(fd, NULL, NULL, vdev);
- event_notifier_cleanup(&vdev->err_notifier);
+ vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0);
vdev->pci_aer = false;
}
}
@@ -2929,7 +2988,7 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
}
qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
NULL, NULL, vdev);
- event_notifier_cleanup(&vdev->err_notifier);
+ vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0);
}
static void vfio_req_notifier_handler(void *opaque)
@@ -2947,7 +3006,7 @@ static void vfio_req_notifier_handler(void *opaque)
}
}
-static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
+void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev)
{
struct vfio_irq_info irq_info;
Error *err = NULL;
@@ -2964,8 +3023,9 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
return;
}
- if (event_notifier_init(&vdev->req_notifier, 0)) {
- error_report("vfio: Unable to init event notifier for device request");
+ if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0,
+ &err)) {
+ error_report_err(err);
return;
}
@@ -2976,7 +3036,7 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
qemu_set_fd_handler(fd, NULL, NULL, vdev);
- event_notifier_cleanup(&vdev->req_notifier);
+ vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0);
} else {
vdev->req_enabled = true;
}
@@ -2996,15 +3056,28 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
}
qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
NULL, NULL, vdev);
- event_notifier_cleanup(&vdev->req_notifier);
+ vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0);
vdev->req_enabled = false;
}
-static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
+bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
{
PCIDevice *pdev = &vdev->pdev;
VFIODevice *vbasedev = &vdev->vbasedev;
+ uint32_t config_space_size;
+ int ret;
+
+ config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size);
+
+ /* Get a copy of config space */
+ ret = vfio_pci_config_space_read(vdev, 0, config_space_size,
+ vdev->pdev.config);
+ if (ret < (int)config_space_size) {
+ ret = ret < 0 ? -ret : EFAULT;
+ error_setg_errno(errp, ret, "failed to read device config space");
+ return false;
+ }
/* vfio emulates a lot for us, but some bits need extra love */
vdev->emulated_config_bits = g_malloc0(vdev->config_size);
@@ -3094,7 +3167,7 @@ static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
return true;
}
-static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
+bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
{
PCIDevice *pdev = &vdev->pdev;
@@ -3126,15 +3199,14 @@ static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
return true;
}
-static void vfio_realize(PCIDevice *pdev, Error **errp)
+static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
{
ERRP_GUARD();
VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
VFIODevice *vbasedev = &vdev->vbasedev;
- int i, ret;
+ int i;
char uuid[UUID_STR_LEN];
g_autofree char *name = NULL;
- uint32_t config_space_size;
if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
if (!(~vdev->host.domain || ~vdev->host.bus ||
@@ -3185,18 +3257,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
goto error;
}
- if (!vfio_populate_device(vdev, errp)) {
- goto error;
- }
-
- config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size);
-
- /* Get a copy of config space */
- ret = vfio_pci_config_space_read(vdev, 0, config_space_size,
- vdev->pdev.config);
- if (ret < (int)config_space_size) {
- ret = ret < 0 ? -ret : EFAULT;
- error_setg_errno(errp, ret, "failed to read device config space");
+ if (!vfio_pci_populate_device(vdev, errp)) {
goto error;
}
@@ -3210,7 +3271,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
goto out_teardown;
}
- if (!vfio_add_capabilities(vdev, errp)) {
+ if (!vfio_pci_add_capabilities(vdev, errp)) {
goto out_unset_idev;
}
@@ -3226,7 +3287,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
vfio_bar_quirk_setup(vdev, i);
}
- if (!vfio_interrupt_setup(vdev, errp)) {
+ if (!vfio_pci_interrupt_setup(vdev, errp)) {
goto out_unset_idev;
}
@@ -3270,8 +3331,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
}
}
- vfio_register_err_notifier(vdev);
- vfio_register_req_notifier(vdev);
+ vfio_pci_register_err_notifier(vdev);
+ vfio_pci_register_req_notifier(vdev);
vfio_setup_resetfn_quirk(vdev);
return;
@@ -3292,8 +3353,8 @@ out_unset_idev:
pci_device_unset_iommu_device(pdev);
}
out_teardown:
- vfio_teardown_msi(vdev);
- vfio_bars_exit(vdev);
+ vfio_pci_teardown_msi(vdev);
+ vfio_pci_bars_exit(vdev);
error:
error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
}
@@ -3302,17 +3363,6 @@ static void vfio_instance_finalize(Object *obj)
{
VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
- vfio_display_finalize(vdev);
- vfio_bars_finalize(vdev);
- g_free(vdev->emulated_config_bits);
- g_free(vdev->rom);
- /*
- * XXX Leaking igd_opregion is not an oversight, we can't remove the
- * fw_cfg entry therefore leaking this allocation seems like the safest
- * option.
- *
- * g_free(vdev->igd_opregion);
- */
vfio_pci_put_device(vdev);
}
@@ -3331,9 +3381,9 @@ static void vfio_exitfn(PCIDevice *pdev)
if (vdev->intx.mmap_timer) {
timer_free(vdev->intx.mmap_timer);
}
- vfio_teardown_msi(vdev);
+ vfio_pci_teardown_msi(vdev);
vfio_pci_disable_rp_atomics(vdev);
- vfio_bars_exit(vdev);
+ vfio_pci_bars_exit(vdev);
vfio_migration_exit(vbasedev);
if (!vbasedev->mdev) {
pci_device_unset_iommu_device(pdev);
@@ -3344,6 +3394,11 @@ static void vfio_pci_reset(DeviceState *dev)
{
VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
+ /* Do not reset the device during qemu_system_reset prior to cpr load */
+ if (cpr_is_incoming()) {
+ return;
+ }
+
trace_vfio_pci_reset(vdev->vbasedev.name);
vfio_pci_pre_reset(vdev);
@@ -3401,6 +3456,13 @@ static void vfio_instance_init(Object *obj)
/* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command
* line, therefore, no need to wait to realize like other devices */
pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
+
+ /*
+ * A device that is resuming for cpr is already configured, so do not
+ * reset it during qemu_system_reset prior to cpr load, else interrupts
+ * may be lost.
+ */
+ pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR;
}
static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data)
@@ -3418,7 +3480,7 @@ static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data)
static const TypeInfo vfio_pci_base_dev_info = {
.name = TYPE_VFIO_PCI_BASE,
.parent = TYPE_PCI_DEVICE,
- .instance_size = 0,
+ .instance_size = sizeof(VFIOPCIDevice),
.abstract = true,
.class_init = vfio_pci_base_dev_class_init,
.interfaces = (const InterfaceInfo[]) {
@@ -3513,8 +3575,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data)
#ifdef CONFIG_IOMMUFD
object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
#endif
+ dc->vmsd = &vfio_cpr_pci_vmstate;
dc->desc = "VFIO-based PCI device assignment";
- pdc->realize = vfio_realize;
+ pdc->realize = vfio_pci_realize;
object_class_property_set_description(klass, /* 1.3 */
"host",
@@ -3640,7 +3703,6 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data)
static const TypeInfo vfio_pci_dev_info = {
.name = TYPE_VFIO_PCI,
.parent = TYPE_VFIO_PCI_BASE,
- .instance_size = sizeof(VFIOPCIDevice),
.class_init = vfio_pci_dev_class_init,
.instance_init = vfio_instance_init,
.instance_finalize = vfio_instance_finalize,
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 5ce0fb9..d3dc227 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -210,6 +210,14 @@ static inline bool vfio_is_vga(VFIOPCIDevice *vdev)
return class == PCI_CLASS_DISPLAY_VGA;
}
+/* MSI/MSI-X/INTx */
+void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr);
+void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
+ int vector_n, bool msix);
+void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev);
+void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev);
+bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp);
+
uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
void vfio_pci_write_config(PCIDevice *pdev,
uint32_t addr, uint32_t val, int len);
@@ -248,4 +256,15 @@ void vfio_display_finalize(VFIOPCIDevice *vdev);
extern const VMStateDescription vfio_display_vmstate;
+void vfio_pci_bars_exit(VFIOPCIDevice *vdev);
+bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp);
+bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp);
+bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp);
+void vfio_pci_intx_eoi(VFIODevice *vbasedev);
+void vfio_pci_put_device(VFIOPCIDevice *vdev);
+bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp);
+void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev);
+void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev);
+void vfio_pci_teardown_msi(VFIOPCIDevice *vdev);
+
#endif /* HW_VFIO_VFIO_PCI_H */
diff --git a/hw/vfio/region.c b/hw/vfio/region.c
index 34752c3..f5b8e3c 100644
--- a/hw/vfio/region.c
+++ b/hw/vfio/region.c
@@ -66,7 +66,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
}
ret = vbasedev->io_ops->region_write(vbasedev, region->nr,
- addr, size, &buf);
+ addr, size, &buf, region->post_wr);
if (ret != size) {
error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
",%d) failed: %s",
@@ -200,6 +200,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
region->size = info->size;
region->fd_offset = info->offset;
region->nr = index;
+ region->post_wr = false;
if (region->size) {
region->mem = g_new0(MemoryRegion, 1);
@@ -241,6 +242,7 @@ int vfio_region_mmap(VFIORegion *region)
{
int i, ret, prot = 0;
char *name;
+ int fd;
if (!region->mem) {
return 0;
@@ -271,14 +273,18 @@ int vfio_region_mmap(VFIORegion *region)
goto no_mmap;
}
+ /* Use the per-region fd if set, or the shared fd. */
+ fd = region->vbasedev->region_fds ?
+ region->vbasedev->region_fds[region->nr] :
+ region->vbasedev->fd,
+
map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
munmap(map_base, map_align - map_base);
munmap(map_align + region->mmaps[i].size,
align - (map_align - map_base));
region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot,
- MAP_SHARED | MAP_FIXED,
- region->vbasedev->fd,
+ MAP_SHARED | MAP_FIXED, fd,
region->fd_offset +
region->mmaps[i].offset);
if (region->mmaps[i].mmap == MAP_FAILED) {
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index e90ec9b..f06236f 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -2,7 +2,7 @@
# pci.c
vfio_intx_interrupt(const char *name, char line) " (%s) Pin %c"
-vfio_intx_eoi(const char *name) " (%s) EOI"
+vfio_pci_intx_eoi(const char *name) " (%s) EOI"
vfio_intx_enable_kvm(const char *name) " (%s) KVM INTx accel enabled"
vfio_intx_disable_kvm(const char *name) " (%s) KVM INTx accel disabled"
vfio_intx_update(const char *name, int new_irq, int target_irq) " (%s) IRQ moved %d -> %d"
@@ -35,8 +35,8 @@ vfio_pci_hot_reset(const char *name, const char *type) " (%s) %s"
vfio_pci_hot_reset_has_dep_devices(const char *name) "%s: hot reset dependent devices:"
vfio_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int group_id) "\t%04x:%02x:%02x.%x group %d"
vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %s"
-vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
-vfio_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
+vfio_pci_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
+vfio_pci_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d"
vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x"
vfio_pci_reset(const char *name) " (%s)"
diff --git a/hw/vfio/vfio-cpr.h b/hw/vfio/vfio-cpr.h
deleted file mode 100644
index 134b83a..0000000
--- a/hw/vfio/vfio-cpr.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * VFIO CPR
- *
- * Copyright (c) 2025 Oracle and/or its affiliates.
- *
- * SPDX-License-Identifier: GPL-2.0-or-later
- */
-
-#ifndef HW_VFIO_CPR_H
-#define HW_VFIO_CPR_H
-
-bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp);
-void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer);
-
-#endif /* HW_VFIO_CPR_H */
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index e20da95..7061b6e 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -209,6 +209,8 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
int ret;
Int128 llend;
Error *local_err = NULL;
+ MemoryRegion *mr;
+ hwaddr xlat;
if (iotlb->target_as != &address_space_memory) {
error_report("Wrong target AS \"%s\", only system memory is allowed",
@@ -228,11 +230,14 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
bool read_only;
- if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL,
- &local_err)) {
+ mr = memory_translate_iotlb(iotlb, &xlat, &local_err);
+ if (!mr) {
error_report_err(local_err);
return;
}
+ vaddr = memory_region_get_ram_ptr(mr) + xlat;
+ read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly;
+
ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova,
iotlb->addr_mask + 1, vaddr, read_only);
if (ret) {