From 0f7a903ba3f0f8dfb347fb15b783aade4833826e Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:11 +0530 Subject: vfio: Add function to unmap VFIO region This function will be used for migration region. Migration region is mmaped when migration starts and will be unmapped when migration is complete. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/common.c | 32 ++++++++++++++++++++++++++++---- hw/vfio/trace-events | 1 + 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 13471ae..c6e98b8 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -924,6 +924,18 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, return 0; } +static void vfio_subregion_unmap(VFIORegion *region, int index) +{ + trace_vfio_region_unmap(memory_region_name(®ion->mmaps[index].mem), + region->mmaps[index].offset, + region->mmaps[index].offset + + region->mmaps[index].size - 1); + memory_region_del_subregion(region->mem, ®ion->mmaps[index].mem); + munmap(region->mmaps[index].mmap, region->mmaps[index].size); + object_unparent(OBJECT(®ion->mmaps[index].mem)); + region->mmaps[index].mmap = NULL; +} + int vfio_region_mmap(VFIORegion *region) { int i, prot = 0; @@ -954,10 +966,7 @@ int vfio_region_mmap(VFIORegion *region) region->mmaps[i].mmap = NULL; for (i--; i >= 0; i--) { - memory_region_del_subregion(region->mem, ®ion->mmaps[i].mem); - munmap(region->mmaps[i].mmap, region->mmaps[i].size); - object_unparent(OBJECT(®ion->mmaps[i].mem)); - region->mmaps[i].mmap = NULL; + vfio_subregion_unmap(region, i); } return ret; @@ -982,6 +991,21 @@ int vfio_region_mmap(VFIORegion *region) return 0; } +void vfio_region_unmap(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + vfio_subregion_unmap(region, i); + } + } +} + void vfio_region_exit(VFIORegion *region) { int i; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 93a0bc2..a0c7b49 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -113,6 +113,7 @@ vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Reg vfio_region_exit(const char *name, int index) "Device %s, region %d" vfio_region_finalize(const char *name, int index) "Device %s, region %d" vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" +vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]" vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" -- cgit v1.1 From e93b733bcf8ee185af14a0f90a217d51cf40e7ea Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:12 +0530 Subject: vfio: Add vfio_get_object callback to VFIODeviceOps Hook vfio_get_object callback for PCI devices. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Suggested-by: Cornelia Huck Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'hw') diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 0d83eb0..bffd5bf 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2394,10 +2394,18 @@ static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) } } +static Object *vfio_pci_get_object(VFIODevice *vbasedev) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + + return OBJECT(vdev); +} + static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, .vfio_eoi = vfio_intx_eoi, + .vfio_get_object = vfio_pci_get_object, }; int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) -- cgit v1.1 From c5e2fb3ce4dbb158732420fbd3b963eebbcd85c8 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:13 +0530 Subject: vfio: Add save and load functions for VFIO PCI devices Added functions to save and restore PCI device specific data, specifically config space of PCI device. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'hw') diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index bffd5bf..e27c88b 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -41,6 +41,7 @@ #include "trace.h" #include "qapi/error.h" #include "migration/blocker.h" +#include "migration/qemu-file.h" #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" @@ -2401,11 +2402,61 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev) return OBJECT(vdev); } +static bool vfio_msix_present(void *opaque, int version_id) +{ + PCIDevice *pdev = opaque; + + return msix_present(pdev); +} + +const VMStateDescription vmstate_vfio_pci_config = { + .name = "VFIOPCIDevice", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), + VMSTATE_END_OF_LIST() + } +}; + +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + + vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); +} + +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + PCIDevice *pdev = &vdev->pdev; + int ret; + + ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); + if (ret) { + return ret; + } + + vfio_pci_write_config(pdev, PCI_COMMAND, + pci_get_word(pdev->config + PCI_COMMAND), 2); + + if (msi_enabled(pdev)) { + vfio_msi_enable(vdev); + } else if (msix_enabled(pdev)) { + vfio_msix_enable(vdev); + } + + return ret; +} + static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, .vfio_eoi = vfio_intx_eoi, .vfio_get_object = vfio_pci_get_object, + .vfio_save_config = vfio_pci_save_config, + .vfio_load_config = vfio_pci_load_config, }; int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) -- cgit v1.1 From a9e271ec9b36ef4c7b5bc3b234c85d93931e192e Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:14 +0530 Subject: vfio: Add migration region initialization and finalize function Whether the VFIO device supports migration or not is decided based of migration region query. If migration region query is successful and migration region initialization is successful then migration is supported else migration is blocked. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Acked-by: Dr. David Alan Gilbert Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/meson.build | 1 + hw/vfio/migration.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 3 ++ 3 files changed, 126 insertions(+) create mode 100644 hw/vfio/migration.c (limited to 'hw') diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 37efa74..da9af29 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -2,6 +2,7 @@ vfio_ss = ss.source_set() vfio_ss.add(files( 'common.c', 'spapr.c', + 'migration.c', )) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c new file mode 100644 index 0000000..fd7faf4 --- /dev/null +++ b/hw/vfio/migration.c @@ -0,0 +1,122 @@ +/* + * Migration support for VFIO devices + * + * Copyright NVIDIA, Inc. 2020 + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include + +#include "hw/vfio/vfio-common.h" +#include "cpu.h" +#include "migration/migration.h" +#include "migration/qemu-file.h" +#include "migration/register.h" +#include "migration/blocker.h" +#include "migration/misc.h" +#include "qapi/error.h" +#include "exec/ramlist.h" +#include "exec/ram_addr.h" +#include "pci.h" +#include "trace.h" + +static void vfio_migration_exit(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + vfio_region_exit(&migration->region); + vfio_region_finalize(&migration->region); + g_free(vbasedev->migration); + vbasedev->migration = NULL; +} + +static int vfio_migration_init(VFIODevice *vbasedev, + struct vfio_region_info *info) +{ + int ret; + Object *obj; + + if (!vbasedev->ops->vfio_get_object) { + return -EINVAL; + } + + obj = vbasedev->ops->vfio_get_object(vbasedev); + if (!obj) { + return -EINVAL; + } + + vbasedev->migration = g_new0(VFIOMigration, 1); + + ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region, + info->index, "migration"); + if (ret) { + error_report("%s: Failed to setup VFIO migration region %d: %s", + vbasedev->name, info->index, strerror(-ret)); + goto err; + } + + if (!vbasedev->migration->region.size) { + error_report("%s: Invalid zero-sized VFIO migration region %d", + vbasedev->name, info->index); + ret = -EINVAL; + goto err; + } + return 0; + +err: + vfio_migration_exit(vbasedev); + return ret; +} + +/* ---------------------------------------------------------------------- */ + +int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) +{ + struct vfio_region_info *info = NULL; + Error *local_err = NULL; + int ret; + + ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION, + VFIO_REGION_SUBTYPE_MIGRATION, &info); + if (ret) { + goto add_blocker; + } + + ret = vfio_migration_init(vbasedev, info); + if (ret) { + goto add_blocker; + } + + g_free(info); + trace_vfio_migration_probe(vbasedev->name, info->index); + return 0; + +add_blocker: + error_setg(&vbasedev->migration_blocker, + "VFIO device doesn't support migration"); + g_free(info); + + ret = migrate_add_blocker(vbasedev->migration_blocker, &local_err); + if (local_err) { + error_propagate(errp, local_err); + error_free(vbasedev->migration_blocker); + vbasedev->migration_blocker = NULL; + } + return ret; +} + +void vfio_migration_finalize(VFIODevice *vbasedev) +{ + if (vbasedev->migration) { + vfio_migration_exit(vbasedev); + } + + if (vbasedev->migration_blocker) { + migrate_del_blocker(vbasedev->migration_blocker); + error_free(vbasedev->migration_blocker); + vbasedev->migration_blocker = NULL; + } +} diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index a0c7b49..9ced5ec 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -145,3 +145,6 @@ vfio_display_edid_link_up(void) "" vfio_display_edid_link_down(void) "" vfio_display_edid_update(uint32_t prefx, uint32_t prefy) "%ux%u" vfio_display_edid_write_error(void) "" + +# migration.c +vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" -- cgit v1.1 From 02a7e71b1e5b1313060927e7c86a10be2d7083a7 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:15 +0530 Subject: vfio: Add VM state change handler to know state of VM VM state change handler is called on change in VM's state. Based on VM state, VFIO device state should be changed. Added read/write helper functions for migration region. Added function to set device_state. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Cornelia Huck [aw: lx -> HWADDR_PRIx, remove redundant parens] Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 2 + 2 files changed, 162 insertions(+) (limited to 'hw') diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index fd7faf4..e1ffae0 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -10,6 +10,7 @@ #include "qemu/osdep.h" #include +#include "sysemu/runstate.h" #include "hw/vfio/vfio-common.h" #include "cpu.h" #include "migration/migration.h" @@ -22,6 +23,157 @@ #include "exec/ram_addr.h" #include "pci.h" #include "trace.h" +#include "hw/hw.h" + +static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count, + off_t off, bool iswrite) +{ + int ret; + + ret = iswrite ? pwrite(vbasedev->fd, val, count, off) : + pread(vbasedev->fd, val, count, off); + if (ret < count) { + error_report("vfio_mig_%s %d byte %s: failed at offset 0x%" + HWADDR_PRIx", err: %s", iswrite ? "write" : "read", count, + vbasedev->name, off, strerror(errno)); + return (ret < 0) ? ret : -EINVAL; + } + return 0; +} + +static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count, + off_t off, bool iswrite) +{ + int ret, done = 0; + __u8 *tbuf = buf; + + while (count) { + int bytes = 0; + + if (count >= 8 && !(off % 8)) { + bytes = 8; + } else if (count >= 4 && !(off % 4)) { + bytes = 4; + } else if (count >= 2 && !(off % 2)) { + bytes = 2; + } else { + bytes = 1; + } + + ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite); + if (ret) { + return ret; + } + + count -= bytes; + done += bytes; + off += bytes; + tbuf += bytes; + } + return done; +} + +#define vfio_mig_read(f, v, c, o) vfio_mig_rw(f, (__u8 *)v, c, o, false) +#define vfio_mig_write(f, v, c, o) vfio_mig_rw(f, (__u8 *)v, c, o, true) + +#define VFIO_MIG_STRUCT_OFFSET(f) \ + offsetof(struct vfio_device_migration_info, f) +/* + * Change the device_state register for device @vbasedev. Bits set in @mask + * are preserved, bits set in @value are set, and bits not set in either @mask + * or @value are cleared in device_state. If the register cannot be accessed, + * the resulting state would be invalid, or the device enters an error state, + * an error is returned. + */ + +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask, + uint32_t value) +{ + VFIOMigration *migration = vbasedev->migration; + VFIORegion *region = &migration->region; + off_t dev_state_off = region->fd_offset + + VFIO_MIG_STRUCT_OFFSET(device_state); + uint32_t device_state; + int ret; + + ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state), + dev_state_off); + if (ret < 0) { + return ret; + } + + device_state = (device_state & mask) | value; + + if (!VFIO_DEVICE_STATE_VALID(device_state)) { + return -EINVAL; + } + + ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state), + dev_state_off); + if (ret < 0) { + int rret; + + rret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state), + dev_state_off); + + if ((rret < 0) || (VFIO_DEVICE_STATE_IS_ERROR(device_state))) { + hw_error("%s: Device in error state 0x%x", vbasedev->name, + device_state); + return rret ? rret : -EIO; + } + return ret; + } + + migration->device_state = device_state; + trace_vfio_migration_set_state(vbasedev->name, device_state); + return 0; +} + +static void vfio_vmstate_change(void *opaque, int running, RunState state) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + uint32_t value, mask; + int ret; + + if (vbasedev->migration->vm_running == running) { + return; + } + + if (running) { + /* + * Here device state can have one of _SAVING, _RESUMING or _STOP bit. + * Transition from _SAVING to _RUNNING can happen if there is migration + * failure, in that case clear _SAVING bit. + * Transition from _RESUMING to _RUNNING occurs during resuming + * phase, in that case clear _RESUMING bit. + * In both the above cases, set _RUNNING bit. + */ + mask = ~VFIO_DEVICE_STATE_MASK; + value = VFIO_DEVICE_STATE_RUNNING; + } else { + /* + * Here device state could be either _RUNNING or _SAVING|_RUNNING. Reset + * _RUNNING bit + */ + mask = ~VFIO_DEVICE_STATE_RUNNING; + value = 0; + } + + ret = vfio_migration_set_state(vbasedev, mask, value); + if (ret) { + /* + * Migration should be aborted in this case, but vm_state_notify() + * currently does not support reporting failures. + */ + error_report("%s: Failed to set device state 0x%x", vbasedev->name, + (migration->device_state & mask) | value); + qemu_file_set_error(migrate_get_current()->to_dst_file, ret); + } + vbasedev->migration->vm_running = running; + trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state), + (migration->device_state & mask) | value); +} static void vfio_migration_exit(VFIODevice *vbasedev) { @@ -38,6 +190,7 @@ static int vfio_migration_init(VFIODevice *vbasedev, { int ret; Object *obj; + VFIOMigration *migration; if (!vbasedev->ops->vfio_get_object) { return -EINVAL; @@ -64,6 +217,10 @@ static int vfio_migration_init(VFIODevice *vbasedev, ret = -EINVAL; goto err; } + + migration = vbasedev->migration; + migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change, + vbasedev); return 0; err: @@ -111,6 +268,9 @@ add_blocker: void vfio_migration_finalize(VFIODevice *vbasedev) { if (vbasedev->migration) { + VFIOMigration *migration = vbasedev->migration; + + qemu_del_vm_change_state_handler(migration->vm_state); vfio_migration_exit(vbasedev); } diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9ced5ec..41de81f 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -148,3 +148,5 @@ vfio_display_edid_write_error(void) "" # migration.c vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" +vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d" +vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d" -- cgit v1.1 From 050c588c2ef6edd75769e6c4869d0ad625d5be90 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:16 +0530 Subject: vfio: Add migration state change notifier Added migration state change notifier to get notification on migration state change. These states are translated to VFIO device state and conveyed to vendor driver. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 28 ++++++++++++++++++++++++++++ hw/vfio/trace-events | 1 + 2 files changed, 29 insertions(+) (limited to 'hw') diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index e1ffae0..7ec85b6 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -175,6 +175,30 @@ static void vfio_vmstate_change(void *opaque, int running, RunState state) (migration->device_state & mask) | value); } +static void vfio_migration_state_notifier(Notifier *notifier, void *data) +{ + MigrationState *s = data; + VFIOMigration *migration = container_of(notifier, VFIOMigration, + migration_state); + VFIODevice *vbasedev = migration->vbasedev; + int ret; + + trace_vfio_migration_state_notifier(vbasedev->name, + MigrationStatus_str(s->state)); + + switch (s->state) { + case MIGRATION_STATUS_CANCELLING: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_FAILED: + ret = vfio_migration_set_state(vbasedev, + ~(VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING), + VFIO_DEVICE_STATE_RUNNING); + if (ret) { + error_report("%s: Failed to set state RUNNING", vbasedev->name); + } + } +} + static void vfio_migration_exit(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; @@ -219,8 +243,11 @@ static int vfio_migration_init(VFIODevice *vbasedev, } migration = vbasedev->migration; + migration->vbasedev = vbasedev; migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change, vbasedev); + migration->migration_state.notify = vfio_migration_state_notifier; + add_migration_state_change_notifier(&migration->migration_state); return 0; err: @@ -270,6 +297,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev) if (vbasedev->migration) { VFIOMigration *migration = vbasedev->migration; + remove_migration_state_change_notifier(&migration->migration_state); qemu_del_vm_change_state_handler(migration->vm_state); vfio_migration_exit(vbasedev); } diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 41de81f..78d7d83 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -150,3 +150,4 @@ vfio_display_edid_write_error(void) "" vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d" vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d" +vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" -- cgit v1.1 From 7c2f5f75f94a8820023a46169a4369fd8189a23c Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:17 +0530 Subject: vfio: Register SaveVMHandlers for VFIO device Define flags to be used as delimiter in migration stream for VFIO devices. Added .save_setup and .save_cleanup functions. Map & unmap migration region from these functions at source during saving or pre-copy phase. Set VFIO device state depending on VM's state. During live migration, VM is running when .save_setup is called, _SAVING | _RUNNING state is set for VFIO device. During save-restore, VM is paused, _SAVING state is set for VFIO device. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 2 + 2 files changed, 104 insertions(+) (limited to 'hw') diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 7ec85b6..ca6fd89 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -8,12 +8,15 @@ */ #include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "qemu/cutils.h" #include #include "sysemu/runstate.h" #include "hw/vfio/vfio-common.h" #include "cpu.h" #include "migration/migration.h" +#include "migration/vmstate.h" #include "migration/qemu-file.h" #include "migration/register.h" #include "migration/blocker.h" @@ -25,6 +28,22 @@ #include "trace.h" #include "hw/hw.h" +/* + * Flags to be used as unique delimiters for VFIO devices in the migration + * stream. These flags are composed as: + * 0xffffffff => MSB 32-bit all 1s + * 0xef10 => Magic ID, represents emulated (virtual) function IO + * 0x0000 => 16-bits reserved for flags + * + * The beginning of state information is marked by _DEV_CONFIG_STATE, + * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a + * certain state information is marked by _END_OF_STATE. + */ +#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) +#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) +#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) + static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count, off_t off, bool iswrite) { @@ -129,6 +148,75 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask, return 0; } +static void vfio_migration_cleanup(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + if (migration->region.mmaps) { + vfio_region_unmap(&migration->region); + } +} + +/* ---------------------------------------------------------------------- */ + +static int vfio_save_setup(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret; + + trace_vfio_save_setup(vbasedev->name); + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); + + if (migration->region.mmaps) { + /* + * Calling vfio_region_mmap() from migration thread. Memory API called + * from this function require locking the iothread when called from + * outside the main loop thread. + */ + qemu_mutex_lock_iothread(); + ret = vfio_region_mmap(&migration->region); + qemu_mutex_unlock_iothread(); + if (ret) { + error_report("%s: Failed to mmap VFIO migration region: %s", + vbasedev->name, strerror(-ret)); + error_report("%s: Falling back to slow path", vbasedev->name); + } + } + + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_MASK, + VFIO_DEVICE_STATE_SAVING); + if (ret) { + error_report("%s: Failed to set state SAVING", vbasedev->name); + return ret; + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + + return 0; +} + +static void vfio_save_cleanup(void *opaque) +{ + VFIODevice *vbasedev = opaque; + + vfio_migration_cleanup(vbasedev); + trace_vfio_save_cleanup(vbasedev->name); +} + +static SaveVMHandlers savevm_vfio_handlers = { + .save_setup = vfio_save_setup, + .save_cleanup = vfio_save_cleanup, +}; + +/* ---------------------------------------------------------------------- */ + static void vfio_vmstate_change(void *opaque, int running, RunState state) { VFIODevice *vbasedev = opaque; @@ -215,6 +303,8 @@ static int vfio_migration_init(VFIODevice *vbasedev, int ret; Object *obj; VFIOMigration *migration; + char id[256] = ""; + g_autofree char *path = NULL, *oid = NULL; if (!vbasedev->ops->vfio_get_object) { return -EINVAL; @@ -244,6 +334,18 @@ static int vfio_migration_init(VFIODevice *vbasedev, migration = vbasedev->migration; migration->vbasedev = vbasedev; + + oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj))); + if (oid) { + path = g_strdup_printf("%s/vfio", oid); + } else { + path = g_strdup("vfio"); + } + strpadcpy(id, sizeof(id), path, '\0'); + + register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers, + vbasedev); + migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change, vbasedev); migration->migration_state.notify = vfio_migration_state_notifier; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 78d7d83..f148b5e 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -151,3 +151,5 @@ vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d" vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d" vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" +vfio_save_setup(const char *name) " (%s)" +vfio_save_cleanup(const char *name) " (%s)" -- cgit v1.1 From 1bc3c535ffbe512126a02b9f588497d5f5b7075b Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:18 +0530 Subject: vfio: Add save state functions to SaveVMHandlers Added .save_live_pending, .save_live_iterate and .save_live_complete_precopy functions. These functions handles pre-copy and stop-and-copy phase. In _SAVING|_RUNNING device state or pre-copy phase: - read pending_bytes. If pending_bytes > 0, go through below steps. - read data_offset - indicates kernel driver to write data to staging buffer. - read data_size - amount of data in bytes written by vendor driver in migration region. - read data_size bytes of data from data_offset in the migration region. - Write data packet to file stream as below: {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data, VFIO_MIG_FLAG_END_OF_STATE } In _SAVING device state or stop-and-copy phase a. read config space of device and save to migration file stream. This doesn't need to be from vendor driver. Any other special config state from driver can be saved as data in following iteration. b. read pending_bytes. If pending_bytes > 0, go through below steps. c. read data_offset - indicates kernel driver to write data to staging buffer. d. read data_size - amount of data in bytes written by vendor driver in migration region. e. read data_size bytes of data from data_offset in the migration region. f. Write data packet as below: {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data} g. iterate through steps b to f while (pending_bytes > 0) h. Write {VFIO_MIG_FLAG_END_OF_STATE} When data region is mapped, its user's responsibility to read data from data_offset of data_size before moving to next steps. Added fix suggested by Artem Polyakov to reset pending_bytes in vfio_save_iterate(). Added fix suggested by Zhi Wang to add 0 as data size in migration stream and add END_OF_STATE delimiter to indicate phase complete. Suggested-by: Artem Polyakov Suggested-by: Zhi Wang Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 276 +++++++++++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 6 ++ 2 files changed, 282 insertions(+) (limited to 'hw') diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index ca6fd89..5e0c9e8 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -148,6 +148,151 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask, return 0; } +static void *get_data_section_size(VFIORegion *region, uint64_t data_offset, + uint64_t data_size, uint64_t *size) +{ + void *ptr = NULL; + uint64_t limit = 0; + int i; + + if (!region->mmaps) { + if (size) { + *size = MIN(data_size, region->size - data_offset); + } + return ptr; + } + + for (i = 0; i < region->nr_mmaps; i++) { + VFIOMmap *map = region->mmaps + i; + + if ((data_offset >= map->offset) && + (data_offset < map->offset + map->size)) { + + /* check if data_offset is within sparse mmap areas */ + ptr = map->mmap + data_offset - map->offset; + if (size) { + *size = MIN(data_size, map->offset + map->size - data_offset); + } + break; + } else if ((data_offset < map->offset) && + (!limit || limit > map->offset)) { + /* + * data_offset is not within sparse mmap areas, find size of + * non-mapped area. Check through all list since region->mmaps list + * is not sorted. + */ + limit = map->offset; + } + } + + if (!ptr && size) { + *size = limit ? MIN(data_size, limit - data_offset) : data_size; + } + return ptr; +} + +static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size) +{ + VFIOMigration *migration = vbasedev->migration; + VFIORegion *region = &migration->region; + uint64_t data_offset = 0, data_size = 0, sz; + int ret; + + ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset)); + if (ret < 0) { + return ret; + } + + ret = vfio_mig_read(vbasedev, &data_size, sizeof(data_size), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size)); + if (ret < 0) { + return ret; + } + + trace_vfio_save_buffer(vbasedev->name, data_offset, data_size, + migration->pending_bytes); + + qemu_put_be64(f, data_size); + sz = data_size; + + while (sz) { + void *buf; + uint64_t sec_size; + bool buf_allocated = false; + + buf = get_data_section_size(region, data_offset, sz, &sec_size); + + if (!buf) { + buf = g_try_malloc(sec_size); + if (!buf) { + error_report("%s: Error allocating buffer ", __func__); + return -ENOMEM; + } + buf_allocated = true; + + ret = vfio_mig_read(vbasedev, buf, sec_size, + region->fd_offset + data_offset); + if (ret < 0) { + g_free(buf); + return ret; + } + } + + qemu_put_buffer(f, buf, sec_size); + + if (buf_allocated) { + g_free(buf); + } + sz -= sec_size; + data_offset += sec_size; + } + + ret = qemu_file_get_error(f); + + if (!ret && size) { + *size = data_size; + } + + return ret; +} + +static int vfio_update_pending(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + VFIORegion *region = &migration->region; + uint64_t pending_bytes = 0; + int ret; + + ret = vfio_mig_read(vbasedev, &pending_bytes, sizeof(pending_bytes), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(pending_bytes)); + if (ret < 0) { + migration->pending_bytes = 0; + return ret; + } + + migration->pending_bytes = pending_bytes; + trace_vfio_update_pending(vbasedev->name, pending_bytes); + return 0; +} + +static int vfio_save_device_config_state(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE); + + if (vbasedev->ops && vbasedev->ops->vfio_save_config) { + vbasedev->ops->vfio_save_config(vbasedev, f); + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + trace_vfio_save_device_config_state(vbasedev->name); + + return qemu_file_get_error(f); +} + static void vfio_migration_cleanup(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; @@ -210,9 +355,140 @@ static void vfio_save_cleanup(void *opaque) trace_vfio_save_cleanup(vbasedev->name); } +static void vfio_save_pending(QEMUFile *f, void *opaque, + uint64_t threshold_size, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret; + + ret = vfio_update_pending(vbasedev); + if (ret) { + return; + } + + *res_precopy_only += migration->pending_bytes; + + trace_vfio_save_pending(vbasedev->name, *res_precopy_only, + *res_postcopy_only, *res_compatible); +} + +static int vfio_save_iterate(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + uint64_t data_size; + int ret; + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); + + if (migration->pending_bytes == 0) { + ret = vfio_update_pending(vbasedev); + if (ret) { + return ret; + } + + if (migration->pending_bytes == 0) { + qemu_put_be64(f, 0); + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + /* indicates data finished, goto complete phase */ + return 1; + } + } + + ret = vfio_save_buffer(f, vbasedev, &data_size); + if (ret) { + error_report("%s: vfio_save_buffer failed %s", vbasedev->name, + strerror(errno)); + return ret; + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + + /* + * Reset pending_bytes as .save_live_pending is not called during savevm or + * snapshot case, in such case vfio_update_pending() at the start of this + * function updates pending_bytes. + */ + migration->pending_bytes = 0; + trace_vfio_save_iterate(vbasedev->name, data_size); + return 0; +} + +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + uint64_t data_size; + int ret; + + ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_RUNNING, + VFIO_DEVICE_STATE_SAVING); + if (ret) { + error_report("%s: Failed to set state STOP and SAVING", + vbasedev->name); + return ret; + } + + ret = vfio_save_device_config_state(f, opaque); + if (ret) { + return ret; + } + + ret = vfio_update_pending(vbasedev); + if (ret) { + return ret; + } + + while (migration->pending_bytes > 0) { + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); + ret = vfio_save_buffer(f, vbasedev, &data_size); + if (ret < 0) { + error_report("%s: Failed to save buffer", vbasedev->name); + return ret; + } + + if (data_size == 0) { + break; + } + + ret = vfio_update_pending(vbasedev); + if (ret) { + return ret; + } + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + + ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_SAVING, 0); + if (ret) { + error_report("%s: Failed to set state STOPPED", vbasedev->name); + return ret; + } + + trace_vfio_save_complete_precopy(vbasedev->name); + return ret; +} + static SaveVMHandlers savevm_vfio_handlers = { .save_setup = vfio_save_setup, .save_cleanup = vfio_save_cleanup, + .save_live_pending = vfio_save_pending, + .save_live_iterate = vfio_save_iterate, + .save_live_complete_precopy = vfio_save_complete_precopy, }; /* ---------------------------------------------------------------------- */ diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index f148b5e..9f5712d 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -153,3 +153,9 @@ vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" vfio_save_setup(const char *name) " (%s)" vfio_save_cleanup(const char *name) " (%s)" +vfio_save_buffer(const char *name, uint64_t data_offset, uint64_t data_size, uint64_t pending) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64" pending 0x%"PRIx64 +vfio_update_pending(const char *name, uint64_t pending) " (%s) pending 0x%"PRIx64 +vfio_save_device_config_state(const char *name) " (%s)" +vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64 +vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d" +vfio_save_complete_precopy(const char *name) " (%s)" -- cgit v1.1 From 3336d21710130a3d64c901aeae2dc66c364f93ad Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:19 +0530 Subject: vfio: Add load state functions to SaveVMHandlers Sequence during _RESUMING device state: While data for this device is available, repeat below steps: a. read data_offset from where user application should write data. b. write data of data_size to migration region from data_offset. c. write data_size which indicates vendor driver that data is written in staging buffer. For user, data is opaque. User should write data in the same order as received. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 4 ++ 2 files changed, 199 insertions(+) (limited to 'hw') diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 5e0c9e8..1af0fce 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -257,6 +257,77 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size) return ret; } +static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev, + uint64_t data_size) +{ + VFIORegion *region = &vbasedev->migration->region; + uint64_t data_offset = 0, size, report_size; + int ret; + + do { + ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset)); + if (ret < 0) { + return ret; + } + + if (data_offset + data_size > region->size) { + /* + * If data_size is greater than the data section of migration region + * then iterate the write buffer operation. This case can occur if + * size of migration region at destination is smaller than size of + * migration region at source. + */ + report_size = size = region->size - data_offset; + data_size -= size; + } else { + report_size = size = data_size; + data_size = 0; + } + + trace_vfio_load_state_device_data(vbasedev->name, data_offset, size); + + while (size) { + void *buf; + uint64_t sec_size; + bool buf_alloc = false; + + buf = get_data_section_size(region, data_offset, size, &sec_size); + + if (!buf) { + buf = g_try_malloc(sec_size); + if (!buf) { + error_report("%s: Error allocating buffer ", __func__); + return -ENOMEM; + } + buf_alloc = true; + } + + qemu_get_buffer(f, buf, sec_size); + + if (buf_alloc) { + ret = vfio_mig_write(vbasedev, buf, sec_size, + region->fd_offset + data_offset); + g_free(buf); + + if (ret < 0) { + return ret; + } + } + size -= sec_size; + data_offset += sec_size; + } + + ret = vfio_mig_write(vbasedev, &report_size, sizeof(report_size), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size)); + if (ret < 0) { + return ret; + } + } while (data_size); + + return 0; +} + static int vfio_update_pending(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; @@ -293,6 +364,33 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque) return qemu_file_get_error(f); } +static int vfio_load_device_config_state(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + uint64_t data; + + if (vbasedev->ops && vbasedev->ops->vfio_load_config) { + int ret; + + ret = vbasedev->ops->vfio_load_config(vbasedev, f); + if (ret) { + error_report("%s: Failed to load device config space", + vbasedev->name); + return ret; + } + } + + data = qemu_get_be64(f); + if (data != VFIO_MIG_FLAG_END_OF_STATE) { + error_report("%s: Failed loading device config space, " + "end flag incorrect 0x%"PRIx64, vbasedev->name, data); + return -EINVAL; + } + + trace_vfio_load_device_config_state(vbasedev->name); + return qemu_file_get_error(f); +} + static void vfio_migration_cleanup(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; @@ -483,12 +581,109 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) return ret; } +static int vfio_load_setup(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret = 0; + + if (migration->region.mmaps) { + ret = vfio_region_mmap(&migration->region); + if (ret) { + error_report("%s: Failed to mmap VFIO migration region %d: %s", + vbasedev->name, migration->region.nr, + strerror(-ret)); + error_report("%s: Falling back to slow path", vbasedev->name); + } + } + + ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_MASK, + VFIO_DEVICE_STATE_RESUMING); + if (ret) { + error_report("%s: Failed to set state RESUMING", vbasedev->name); + if (migration->region.mmaps) { + vfio_region_unmap(&migration->region); + } + } + return ret; +} + +static int vfio_load_cleanup(void *opaque) +{ + VFIODevice *vbasedev = opaque; + + vfio_migration_cleanup(vbasedev); + trace_vfio_load_cleanup(vbasedev->name); + return 0; +} + +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) +{ + VFIODevice *vbasedev = opaque; + int ret = 0; + uint64_t data; + + data = qemu_get_be64(f); + while (data != VFIO_MIG_FLAG_END_OF_STATE) { + + trace_vfio_load_state(vbasedev->name, data); + + switch (data) { + case VFIO_MIG_FLAG_DEV_CONFIG_STATE: + { + ret = vfio_load_device_config_state(f, opaque); + if (ret) { + return ret; + } + break; + } + case VFIO_MIG_FLAG_DEV_SETUP_STATE: + { + data = qemu_get_be64(f); + if (data == VFIO_MIG_FLAG_END_OF_STATE) { + return ret; + } else { + error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64, + vbasedev->name, data); + return -EINVAL; + } + break; + } + case VFIO_MIG_FLAG_DEV_DATA_STATE: + { + uint64_t data_size = qemu_get_be64(f); + + if (data_size) { + ret = vfio_load_buffer(f, vbasedev, data_size); + if (ret < 0) { + return ret; + } + } + break; + } + default: + error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data); + return -EINVAL; + } + + data = qemu_get_be64(f); + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + } + return ret; +} + static SaveVMHandlers savevm_vfio_handlers = { .save_setup = vfio_save_setup, .save_cleanup = vfio_save_cleanup, .save_live_pending = vfio_save_pending, .save_live_iterate = vfio_save_iterate, .save_live_complete_precopy = vfio_save_complete_precopy, + .load_setup = vfio_load_setup, + .load_cleanup = vfio_load_cleanup, + .load_state = vfio_load_state, }; /* ---------------------------------------------------------------------- */ diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9f5712d..a75b520 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -159,3 +159,7 @@ vfio_save_device_config_state(const char *name) " (%s)" vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64 vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d" vfio_save_complete_precopy(const char *name) " (%s)" +vfio_load_device_config_state(const char *name) " (%s)" +vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 +vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64 +vfio_load_cleanup(const char *name) " (%s)" -- cgit v1.1 From 87ea529c5020124440cd892a038dffe6057fd613 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:21 +0530 Subject: vfio: Get migration capability flags for container Added helper functions to get IOMMU info capability chain. Added function to get migration capability information from that capability chain for IOMMU container. Similar change was proposed earlier: https://lists.gnu.org/archive/html/qemu-devel/2018-05/msg03759.html Disable migration for devices if IOMMU module doesn't support migration capability. Signed-off-by: Kirti Wankhede Cc: Shameer Kolothum Cc: Eric Auger Signed-off-by: Alex Williamson --- hw/vfio/common.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++----- hw/vfio/migration.c | 7 ++++- 2 files changed, 88 insertions(+), 9 deletions(-) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index c6e98b8..d4959c0 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1228,6 +1228,75 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, return 0; } +static int vfio_get_iommu_info(VFIOContainer *container, + struct vfio_iommu_type1_info **info) +{ + + size_t argsz = sizeof(struct vfio_iommu_type1_info); + + *info = g_new0(struct vfio_iommu_type1_info, 1); +again: + (*info)->argsz = argsz; + + if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) { + g_free(*info); + *info = NULL; + return -errno; + } + + if (((*info)->argsz > argsz)) { + argsz = (*info)->argsz; + *info = g_realloc(*info, argsz); + goto again; + } + + return 0; +} + +static struct vfio_info_cap_header * +vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) +{ + struct vfio_info_cap_header *hdr; + void *ptr = info; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { + return NULL; + } + + for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + if (hdr->id == id) { + return hdr; + } + } + + return NULL; +} + +static void vfio_get_iommu_info_migration(VFIOContainer *container, + struct vfio_iommu_type1_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_migration *cap_mig; + + hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); + if (!hdr) { + return; + } + + cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration, + header); + + /* + * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of + * TARGET_PAGE_SIZE to mark those dirty. + */ + if (cap_mig->pgsize_bitmap & TARGET_PAGE_SIZE) { + container->dirty_pages_supported = true; + container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + container->dirty_pgsizes = cap_mig->pgsize_bitmap; + } +} + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { @@ -1297,6 +1366,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->space = space; container->fd = fd; container->error = NULL; + container->dirty_pages_supported = false; QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->hostwin_list); @@ -1309,7 +1379,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: { - struct vfio_iommu_type1_info info; + struct vfio_iommu_type1_info *info; /* * FIXME: This assumes that a Type1 IOMMU can map any 64-bit @@ -1318,15 +1388,19 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * existing Type1 IOMMUs generally support any IOVA we're * going to actually try in practice. */ - info.argsz = sizeof(info); - ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info); - /* Ignore errors */ - if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) { + ret = vfio_get_iommu_info(container, &info); + + if (ret || !(info->flags & VFIO_IOMMU_INFO_PGSIZES)) { /* Assume 4k IOVA page size */ - info.iova_pgsizes = 4096; + info->iova_pgsizes = 4096; } - vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes); - container->pgsizes = info.iova_pgsizes; + vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes); + container->pgsizes = info->iova_pgsizes; + + if (!ret) { + vfio_get_iommu_info_migration(container, info); + } + g_free(info); break; } case VFIO_SPAPR_TCE_v2_IOMMU: diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 1af0fce..39503b4 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -832,9 +832,14 @@ err: int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) { + VFIOContainer *container = vbasedev->group->container; struct vfio_region_info *info = NULL; Error *local_err = NULL; - int ret; + int ret = -ENOTSUP; + + if (!container->dirty_pages_supported) { + goto add_blocker; + } ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION, VFIO_REGION_SUBTYPE_MIGRATION, &info); -- cgit v1.1 From e663f516830c61f1dcafd2dda810126c14327b15 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:22 +0530 Subject: vfio: Add function to start and stop dirty pages tracking Call VFIO_IOMMU_DIRTY_PAGES ioctl to start and stop dirty pages tracking for VFIO devices. Signed-off-by: Kirti Wankhede Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'hw') diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 39503b4..a248eff 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -11,6 +11,7 @@ #include "qemu/main-loop.h" #include "qemu/cutils.h" #include +#include #include "sysemu/runstate.h" #include "hw/vfio/vfio-common.h" @@ -391,10 +392,40 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque) return qemu_file_get_error(f); } +static int vfio_set_dirty_page_tracking(VFIODevice *vbasedev, bool start) +{ + int ret; + VFIOMigration *migration = vbasedev->migration; + VFIOContainer *container = vbasedev->group->container; + struct vfio_iommu_type1_dirty_bitmap dirty = { + .argsz = sizeof(dirty), + }; + + if (start) { + if (migration->device_state & VFIO_DEVICE_STATE_SAVING) { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; + } else { + return -EINVAL; + } + } else { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; + } + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); + if (ret) { + error_report("Failed to set dirty tracking flag 0x%x errno: %d", + dirty.flags, errno); + return -errno; + } + return ret; +} + static void vfio_migration_cleanup(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; + vfio_set_dirty_page_tracking(vbasedev, false); + if (migration->region.mmaps) { vfio_region_unmap(&migration->region); } @@ -435,6 +466,11 @@ static int vfio_save_setup(QEMUFile *f, void *opaque) return ret; } + ret = vfio_set_dirty_page_tracking(vbasedev, true); + if (ret) { + return ret; + } + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); ret = qemu_file_get_error(f); -- cgit v1.1 From b6dd6504e303d4339df2db765433bffa36dfbf8b Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:23 +0530 Subject: vfio: Add vfio_listener_log_sync to mark dirty pages vfio_listener_log_sync gets list of dirty pages from container using VFIO_IOMMU_GET_DIRTY_BITMAP ioctl and mark those pages dirty when all devices are stopped and saving state. Return early for the RAM block section of mapped MMIO region. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia [aw: fix error_report types, fix cpu_physical_memory_set_dirty_lebitmap() cast] Signed-off-by: Alex Williamson --- hw/vfio/common.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 1 + 2 files changed, 117 insertions(+) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d4959c0..0a97fbf 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -29,6 +29,7 @@ #include "hw/vfio/vfio.h" #include "exec/address-spaces.h" #include "exec/memory.h" +#include "exec/ram_addr.h" #include "hw/hw.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -37,6 +38,7 @@ #include "sysemu/reset.h" #include "trace.h" #include "qapi/error.h" +#include "migration/migration.h" VFIOGroupList vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -287,6 +289,39 @@ const MemoryRegionOps vfio_region_ops = { }; /* + * Device state interfaces + */ + +static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + MigrationState *ms = migrate_get_current(); + + if (!migration_is_setup_or_active(ms->state)) { + return false; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + VFIOMigration *migration = vbasedev->migration; + + if (!migration) { + return false; + } + + if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) && + !(migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { + continue; + } else { + return false; + } + } + } + return true; +} + +/* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ static int vfio_dma_unmap(VFIOContainer *container, @@ -812,9 +847,90 @@ static void vfio_listener_region_del(MemoryListener *listener, } } +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + uint64_t size, ram_addr_t ram_addr) +{ + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + uint64_t pages; + int ret; + + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); + dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + range->iova = iova; + range->size = size; + + /* + * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of + * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to + * TARGET_PAGE_SIZE. + */ + range->bitmap.pgsize = TARGET_PAGE_SIZE; + + pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS; + range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + range->bitmap.data = g_try_malloc0(range->bitmap.size); + if (!range->bitmap.data) { + ret = -ENOMEM; + goto err_out; + } + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { + error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, + (uint64_t)range->size, errno); + goto err_out; + } + + cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data, + ram_addr, pages); + + trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size, + range->bitmap.size, ram_addr); +err_out: + g_free(range->bitmap.data); + g_free(dbitmap); + + return ret; +} + +static int vfio_sync_dirty_bitmap(VFIOContainer *container, + MemoryRegionSection *section) +{ + ram_addr_t ram_addr; + + ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + + return vfio_get_dirty_bitmap(container, + TARGET_PAGE_ALIGN(section->offset_within_address_space), + int128_get64(section->size), ram_addr); +} + +static void vfio_listerner_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + if (vfio_listener_skipped_section(section) || + !container->dirty_pages_supported) { + return; + } + + if (vfio_devices_all_stopped_and_saving(container)) { + vfio_sync_dirty_bitmap(container, section); + } +} + static const MemoryListener vfio_memory_listener = { .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, + .log_sync = vfio_listerner_log_sync, }; static void vfio_listener_release(VFIOContainer *container) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index a75b520..dd991bd 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -163,3 +163,4 @@ vfio_load_device_config_state(const char *name) " (%s)" vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64 vfio_load_cleanup(const char *name) " (%s)" +vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64 -- cgit v1.1 From 9a04fe09576b0399646e80e57ff2d2324f7cf64d Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:24 +0530 Subject: vfio: Dirty page tracking when vIOMMU is enabled When vIOMMU is enabled, register MAP notifier from log_sync when all devices in container are in stop and copy phase of migration. Call replay and get dirty pages from notifier callback. Suggested-by: Alex Williamson Signed-off-by: Kirti Wankhede Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- hw/vfio/common.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++---- hw/vfio/trace-events | 1 + 2 files changed, 83 insertions(+), 6 deletions(-) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 0a97fbf..43e6e89 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -442,8 +442,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) } /* Called with rcu_read_lock held. */ -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr, - bool *read_only) +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + ram_addr_t *ram_addr, bool *read_only) { MemoryRegion *mr; hwaddr xlat; @@ -474,8 +474,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr, return false; } - *vaddr = memory_region_get_ram_ptr(mr) + xlat; - *read_only = !writable || mr->readonly; + if (vaddr) { + *vaddr = memory_region_get_ram_ptr(mr) + xlat; + } + + if (ram_addr) { + *ram_addr = memory_region_get_ram_addr(mr) + xlat; + } + + if (read_only) { + *read_only = !writable || mr->readonly; + } return true; } @@ -485,7 +494,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainer *container = giommu->container; hwaddr iova = iotlb->iova + giommu->iommu_offset; - bool read_only; void *vaddr; int ret; @@ -501,7 +509,9 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_lock(); if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { - if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) { + bool read_only; + + if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) { goto out; } /* @@ -899,11 +909,77 @@ err_out: return ret; } +typedef struct { + IOMMUNotifier n; + VFIOGuestIOMMU *giommu; +} vfio_giommu_dirty_notifier; + +static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + vfio_giommu_dirty_notifier *gdn = container_of(n, + vfio_giommu_dirty_notifier, n); + VFIOGuestIOMMU *giommu = gdn->giommu; + VFIOContainer *container = giommu->container; + hwaddr iova = iotlb->iova + giommu->iommu_offset; + ram_addr_t translated_addr; + + trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); + + if (iotlb->target_as != &address_space_memory) { + error_report("Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + return; + } + + rcu_read_lock(); + if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { + int ret; + + ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, + translated_addr); + if (ret) { + error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%m)", + container, iova, + iotlb->addr_mask + 1, ret); + } + } + rcu_read_unlock(); +} + static int vfio_sync_dirty_bitmap(VFIOContainer *container, MemoryRegionSection *section) { ram_addr_t ram_addr; + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu) == section->mr && + giommu->n.start == section->offset_within_region) { + Int128 llend; + vfio_giommu_dirty_notifier gdn = { .giommu = giommu }; + int idx = memory_region_iommu_attrs_to_index(giommu->iommu, + MEMTXATTRS_UNSPECIFIED); + + llend = int128_add(int128_make64(section->offset_within_region), + section->size); + llend = int128_sub(llend, int128_one()); + + iommu_notifier_init(&gdn.n, + vfio_iommu_map_dirty_notify, + IOMMU_NOTIFIER_MAP, + section->offset_within_region, + int128_get64(llend), + idx); + memory_region_iommu_replay(giommu->iommu, &gdn.n); + break; + } + } + return 0; + } + ram_addr = memory_region_get_ram_addr(section->mr) + section->offset_within_region; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index dd991bd..c0e75f2 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -164,3 +164,4 @@ vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64 vfio_load_cleanup(const char *name) " (%s)" vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64 +vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 -- cgit v1.1 From 9e7b0442f23a92c27204d6f81a954f30f3126d33 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:25 +0530 Subject: vfio: Add ioctl to get dirty pages bitmap during dma unmap With vIOMMU, IO virtual address range can get unmapped while in pre-copy phase of migration. In that case, unmap ioctl should return pages pinned in that range and QEMU should find its correcponding guest physical addresses and report those dirty. Suggested-by: Alex Williamson Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia [aw: fix error_report types, fix cpu_physical_memory_set_dirty_lebitmap() cast] Signed-off-by: Alex Williamson --- hw/vfio/common.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 93 insertions(+), 4 deletions(-) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 43e6e89..620358a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -321,11 +321,95 @@ static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) return true; } +static bool vfio_devices_all_running_and_saving(VFIOContainer *container) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + MigrationState *ms = migrate_get_current(); + + if (!migration_is_setup_or_active(ms->state)) { + return false; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + VFIOMigration *migration = vbasedev->migration; + + if (!migration) { + return false; + } + + if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) && + (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { + continue; + } else { + return false; + } + } + } + return true; +} + +static int vfio_dma_unmap_bitmap(VFIOContainer *container, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + struct vfio_iommu_type1_dma_unmap *unmap; + struct vfio_bitmap *bitmap; + uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS; + int ret; + + unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); + + unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); + unmap->iova = iova; + unmap->size = size; + unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP; + bitmap = (struct vfio_bitmap *)&unmap->data; + + /* + * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of + * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to + * TARGET_PAGE_SIZE. + */ + + bitmap->pgsize = TARGET_PAGE_SIZE; + bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + + if (bitmap->size > container->max_dirty_bitmap_size) { + error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, + (uint64_t)bitmap->size); + ret = -E2BIG; + goto unmap_exit; + } + + bitmap->data = g_try_malloc0(bitmap->size); + if (!bitmap->data) { + ret = -ENOMEM; + goto unmap_exit; + } + + ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); + if (!ret) { + cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data, + iotlb->translated_addr, pages); + } else { + error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); + } + + g_free(bitmap->data); +unmap_exit: + g_free(unmap); + return ret; +} + /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ static int vfio_dma_unmap(VFIOContainer *container, - hwaddr iova, ram_addr_t size) + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) { struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), @@ -334,6 +418,11 @@ static int vfio_dma_unmap(VFIOContainer *container, .size = size, }; + if (iotlb && container->dirty_pages_supported && + vfio_devices_all_running_and_saving(container)) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { /* * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c @@ -381,7 +470,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, * the VGA ROM space. */ if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || - (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 && + (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 && ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { return 0; } @@ -531,7 +620,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) iotlb->addr_mask + 1, vaddr, ret); } } else { - ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1); + ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%m)", @@ -834,7 +923,7 @@ static void vfio_listener_region_del(MemoryListener *listener, } if (try_unmap) { - ret = vfio_dma_unmap(container, iova, int128_get64(llsize)); + ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%m)", -- cgit v1.1 From a22651053b59b7d40bf921e8819ea696a3b0a9d2 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:26 +0530 Subject: vfio: Make vfio-pci device migration capable If the device is not a failover primary device, call vfio_migration_probe() and vfio_migration_finalize() to enable migration support for those devices that support it respectively to tear it down again. Removed migration blocker from VFIO PCI device specific structure and use migration blocker from generic structure of VFIO device. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 28 ++++++++-------------------- hw/vfio/pci.h | 1 - 2 files changed, 8 insertions(+), 21 deletions(-) (limited to 'hw') diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e27c88b..58c0ce8 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2791,17 +2791,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) return; } - if (!pdev->failover_pair_id) { - error_setg(&vdev->migration_blocker, - "VFIO device doesn't support migration"); - ret = migrate_add_blocker(vdev->migration_blocker, errp); - if (ret) { - error_free(vdev->migration_blocker); - vdev->migration_blocker = NULL; - return; - } - } - vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev); vdev->vbasedev.ops = &vfio_pci_ops; vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; @@ -3069,6 +3058,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } + if (!pdev->failover_pair_id) { + ret = vfio_migration_probe(&vdev->vbasedev, errp); + if (ret) { + error_report("%s: Migration disabled", vdev->vbasedev.name); + } + } + vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); @@ -3083,11 +3079,6 @@ out_teardown: vfio_bars_exit(vdev); error: error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name); - if (vdev->migration_blocker) { - migrate_del_blocker(vdev->migration_blocker); - error_free(vdev->migration_blocker); - vdev->migration_blocker = NULL; - } } static void vfio_instance_finalize(Object *obj) @@ -3099,10 +3090,6 @@ static void vfio_instance_finalize(Object *obj) vfio_bars_finalize(vdev); g_free(vdev->emulated_config_bits); g_free(vdev->rom); - if (vdev->migration_blocker) { - migrate_del_blocker(vdev->migration_blocker); - error_free(vdev->migration_blocker); - } /* * XXX Leaking igd_opregion is not an oversight, we can't remove the * fw_cfg entry therefore leaking this allocation seems like the safest @@ -3130,6 +3117,7 @@ static void vfio_exitfn(PCIDevice *pdev) } vfio_teardown_msi(vdev); vfio_bars_exit(vdev); + vfio_migration_finalize(&vdev->vbasedev); } static void vfio_pci_reset(DeviceState *dev) diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index bce71a9..1574ef9 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -172,7 +172,6 @@ struct VFIOPCIDevice { bool no_vfio_ioeventfd; bool enable_ramfb; VFIODisplay *dpy; - Error *migration_blocker; Notifier irqchip_change_notifier; }; -- cgit v1.1 From 3710586caa5d91a52c0cf247e1c829a50f2e7b98 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:27 +0530 Subject: qapi: Add VFIO devices migration stats in Migration stats Added amount of bytes transferred to the VM at destination by all VFIO devices Signed-off-by: Kirti Wankhede Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alex Williamson --- hw/vfio/common.c | 19 +++++++++++++++++++ hw/vfio/migration.c | 9 +++++++++ 2 files changed, 28 insertions(+) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 620358a..d41ba67 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -292,6 +292,25 @@ const MemoryRegionOps vfio_region_ops = { * Device state interfaces */ +bool vfio_mig_active(void) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + + if (QLIST_EMPTY(&vfio_group_list)) { + return false; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (vbasedev->migration_blocker) { + return false; + } + } + } + return true; +} + static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) { VFIOGroup *group; diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index a248eff..3ce285e 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -45,6 +45,8 @@ #define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) #define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) +static int64_t bytes_transferred; + static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count, off_t off, bool iswrite) { @@ -255,6 +257,7 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size) *size = data_size; } + bytes_transferred += data_size; return ret; } @@ -785,6 +788,7 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data) case MIGRATION_STATUS_CANCELLING: case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_FAILED: + bytes_transferred = 0; ret = vfio_migration_set_state(vbasedev, ~(VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING), VFIO_DEVICE_STATE_RUNNING); @@ -866,6 +870,11 @@ err: /* ---------------------------------------------------------------------- */ +int64_t vfio_mig_bytes_transferred(void) +{ + return bytes_transferred; +} + int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) { VFIOContainer *container = vbasedev->group->container; -- cgit v1.1 From 408b55db8be3e3edae041d46ef8786fabc1476aa Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:31 -0400 Subject: s390x/pci: Move header files to include/hw/s390x Seems a more appropriate location for them. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 4 +- hw/s390x/s390-pci-bus.h | 372 --------------------------------------------- hw/s390x/s390-pci-inst.c | 4 +- hw/s390x/s390-pci-inst.h | 312 ------------------------------------- hw/s390x/s390-virtio-ccw.c | 2 +- 5 files changed, 5 insertions(+), 689 deletions(-) delete mode 100644 hw/s390x/s390-pci-bus.h delete mode 100644 hw/s390x/s390-pci-inst.h (limited to 'hw') diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index fb4cee8..a929340 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -15,8 +15,8 @@ #include "qapi/error.h" #include "qapi/visitor.h" #include "cpu.h" -#include "s390-pci-bus.h" -#include "s390-pci-inst.h" +#include "hw/s390x/s390-pci-bus.h" +#include "hw/s390x/s390-pci-inst.h" #include "hw/pci/pci_bus.h" #include "hw/qdev-properties.h" #include "hw/pci/pci_bridge.h" diff --git a/hw/s390x/s390-pci-bus.h b/hw/s390x/s390-pci-bus.h deleted file mode 100644 index 97464d0..0000000 --- a/hw/s390x/s390-pci-bus.h +++ /dev/null @@ -1,372 +0,0 @@ -/* - * s390 PCI BUS definitions - * - * Copyright 2014 IBM Corp. - * Author(s): Frank Blaschka - * Hong Bo Li - * Yi Min Zhao - * - * This work is licensed under the terms of the GNU GPL, version 2 or (at - * your option) any later version. See the COPYING file in the top-level - * directory. - */ - -#ifndef HW_S390_PCI_BUS_H -#define HW_S390_PCI_BUS_H - -#include "hw/pci/pci.h" -#include "hw/pci/pci_host.h" -#include "hw/s390x/sclp.h" -#include "hw/s390x/s390_flic.h" -#include "hw/s390x/css.h" -#include "qom/object.h" - -#define TYPE_S390_PCI_HOST_BRIDGE "s390-pcihost" -#define TYPE_S390_PCI_BUS "s390-pcibus" -#define TYPE_S390_PCI_DEVICE "zpci" -#define TYPE_S390_PCI_IOMMU "s390-pci-iommu" -#define TYPE_S390_IOMMU_MEMORY_REGION "s390-iommu-memory-region" -#define FH_MASK_ENABLE 0x80000000 -#define FH_MASK_INSTANCE 0x7f000000 -#define FH_MASK_SHM 0x00ff0000 -#define FH_MASK_INDEX 0x0000ffff -#define FH_SHM_VFIO 0x00010000 -#define FH_SHM_EMUL 0x00020000 -#define ZPCI_MAX_FID 0xffffffff -#define ZPCI_MAX_UID 0xffff -#define UID_UNDEFINED 0 -#define UID_CHECKING_ENABLED 0x01 - -OBJECT_DECLARE_SIMPLE_TYPE(S390pciState, S390_PCI_HOST_BRIDGE) -OBJECT_DECLARE_SIMPLE_TYPE(S390PCIBus, S390_PCI_BUS) -OBJECT_DECLARE_SIMPLE_TYPE(S390PCIBusDevice, S390_PCI_DEVICE) -OBJECT_DECLARE_SIMPLE_TYPE(S390PCIIOMMU, S390_PCI_IOMMU) - -#define HP_EVENT_TO_CONFIGURED 0x0301 -#define HP_EVENT_RESERVED_TO_STANDBY 0x0302 -#define HP_EVENT_DECONFIGURE_REQUEST 0x0303 -#define HP_EVENT_CONFIGURED_TO_STBRES 0x0304 -#define HP_EVENT_STANDBY_TO_RESERVED 0x0308 - -#define ERR_EVENT_INVALAS 0x1 -#define ERR_EVENT_OORANGE 0x2 -#define ERR_EVENT_INVALTF 0x3 -#define ERR_EVENT_TPROTE 0x4 -#define ERR_EVENT_APROTE 0x5 -#define ERR_EVENT_KEYE 0x6 -#define ERR_EVENT_INVALTE 0x7 -#define ERR_EVENT_INVALTL 0x8 -#define ERR_EVENT_TT 0x9 -#define ERR_EVENT_INVALMS 0xa -#define ERR_EVENT_SERR 0xb -#define ERR_EVENT_NOMSI 0x10 -#define ERR_EVENT_INVALBV 0x11 -#define ERR_EVENT_AIBV 0x12 -#define ERR_EVENT_AIRERR 0x13 -#define ERR_EVENT_FMBA 0x2a -#define ERR_EVENT_FMBUP 0x2b -#define ERR_EVENT_FMBPRO 0x2c -#define ERR_EVENT_CCONF 0x30 -#define ERR_EVENT_SERVAC 0x3a -#define ERR_EVENT_PERMERR 0x3b - -#define ERR_EVENT_Q_BIT 0x2 -#define ERR_EVENT_MVN_OFFSET 16 - -#define ZPCI_MSI_VEC_BITS 11 -#define ZPCI_MSI_VEC_MASK 0x7ff - -#define ZPCI_MSI_ADDR 0xfe00000000000000ULL -#define ZPCI_SDMA_ADDR 0x100000000ULL -#define ZPCI_EDMA_ADDR 0x1ffffffffffffffULL - -#define PAGE_SHIFT 12 -#define PAGE_SIZE (1 << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) -#define PAGE_DEFAULT_ACC 0 -#define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) - -/* I/O Translation Anchor (IOTA) */ -enum ZpciIoatDtype { - ZPCI_IOTA_STO = 0, - ZPCI_IOTA_RTTO = 1, - ZPCI_IOTA_RSTO = 2, - ZPCI_IOTA_RFTO = 3, - ZPCI_IOTA_PFAA = 4, - ZPCI_IOTA_IOPFAA = 5, - ZPCI_IOTA_IOPTO = 7 -}; - -#define ZPCI_IOTA_IOT_ENABLED 0x800ULL -#define ZPCI_IOTA_DT_ST (ZPCI_IOTA_STO << 2) -#define ZPCI_IOTA_DT_RT (ZPCI_IOTA_RTTO << 2) -#define ZPCI_IOTA_DT_RS (ZPCI_IOTA_RSTO << 2) -#define ZPCI_IOTA_DT_RF (ZPCI_IOTA_RFTO << 2) -#define ZPCI_IOTA_DT_PF (ZPCI_IOTA_PFAA << 2) -#define ZPCI_IOTA_FS_4K 0 -#define ZPCI_IOTA_FS_1M 1 -#define ZPCI_IOTA_FS_2G 2 -#define ZPCI_KEY (PAGE_DEFAULT_KEY << 5) - -#define ZPCI_IOTA_STO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_ST) -#define ZPCI_IOTA_RTTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RT) -#define ZPCI_IOTA_RSTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RS) -#define ZPCI_IOTA_RFTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RF) -#define ZPCI_IOTA_RFAA_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY |\ - ZPCI_IOTA_DT_PF | ZPCI_IOTA_FS_2G) - -/* I/O Region and segment tables */ -#define ZPCI_INDEX_MASK 0x7ffULL - -#define ZPCI_TABLE_TYPE_MASK 0xc -#define ZPCI_TABLE_TYPE_RFX 0xc -#define ZPCI_TABLE_TYPE_RSX 0x8 -#define ZPCI_TABLE_TYPE_RTX 0x4 -#define ZPCI_TABLE_TYPE_SX 0x0 - -#define ZPCI_TABLE_LEN_RFX 0x3 -#define ZPCI_TABLE_LEN_RSX 0x3 -#define ZPCI_TABLE_LEN_RTX 0x3 - -#define ZPCI_TABLE_OFFSET_MASK 0xc0 -#define ZPCI_TABLE_SIZE 0x4000 -#define ZPCI_TABLE_ALIGN ZPCI_TABLE_SIZE -#define ZPCI_TABLE_ENTRY_SIZE (sizeof(unsigned long)) -#define ZPCI_TABLE_ENTRIES (ZPCI_TABLE_SIZE / ZPCI_TABLE_ENTRY_SIZE) - -#define ZPCI_TABLE_BITS 11 -#define ZPCI_PT_BITS 8 -#define ZPCI_ST_SHIFT (ZPCI_PT_BITS + PAGE_SHIFT) -#define ZPCI_RT_SHIFT (ZPCI_ST_SHIFT + ZPCI_TABLE_BITS) - -#define ZPCI_RTE_FLAG_MASK 0x3fffULL -#define ZPCI_RTE_ADDR_MASK (~ZPCI_RTE_FLAG_MASK) -#define ZPCI_STE_FLAG_MASK 0x7ffULL -#define ZPCI_STE_ADDR_MASK (~ZPCI_STE_FLAG_MASK) - -#define ZPCI_SFAA_MASK (~((1ULL << 20) - 1)) - -/* I/O Page tables */ -#define ZPCI_PTE_VALID_MASK 0x400 -#define ZPCI_PTE_INVALID 0x400 -#define ZPCI_PTE_VALID 0x000 -#define ZPCI_PT_SIZE 0x800 -#define ZPCI_PT_ALIGN ZPCI_PT_SIZE -#define ZPCI_PT_ENTRIES (ZPCI_PT_SIZE / ZPCI_TABLE_ENTRY_SIZE) -#define ZPCI_PT_MASK (ZPCI_PT_ENTRIES - 1) - -#define ZPCI_PTE_FLAG_MASK 0xfffULL -#define ZPCI_PTE_ADDR_MASK (~ZPCI_PTE_FLAG_MASK) - -/* Shared bits */ -#define ZPCI_TABLE_VALID 0x00 -#define ZPCI_TABLE_INVALID 0x20 -#define ZPCI_TABLE_PROTECTED 0x200 -#define ZPCI_TABLE_UNPROTECTED 0x000 -#define ZPCI_TABLE_FC 0x400 - -#define ZPCI_TABLE_VALID_MASK 0x20 -#define ZPCI_TABLE_PROT_MASK 0x200 - -#define ZPCI_ETT_RT 1 -#define ZPCI_ETT_ST 0 -#define ZPCI_ETT_PT -1 - -/* PCI Function States - * - * reserved: default; device has just been plugged or is in progress of being - * unplugged - * standby: device is present but not configured; transition from any - * configured state/to this state via sclp configure/deconfigure - * - * The following states make up the "configured" meta-state: - * disabled: device is configured but not enabled; transition between this - * state and enabled via clp enable/disable - * enbaled: device is ready for use; transition to disabled via clp disable; - * may enter an error state - * blocked: ignore all DMA and interrupts; transition back to enabled or from - * error state via mpcifc - * error: an error occurred; transition back to enabled via mpcifc - * permanent error: an unrecoverable error occurred; transition to standby via - * sclp deconfigure - */ -typedef enum { - ZPCI_FS_RESERVED, - ZPCI_FS_STANDBY, - ZPCI_FS_DISABLED, - ZPCI_FS_ENABLED, - ZPCI_FS_BLOCKED, - ZPCI_FS_ERROR, - ZPCI_FS_PERMANENT_ERROR, -} ZpciState; - -typedef struct SeiContainer { - QTAILQ_ENTRY(SeiContainer) link; - uint32_t fid; - uint32_t fh; - uint8_t cc; - uint16_t pec; - uint64_t faddr; - uint32_t e; -} SeiContainer; - -typedef struct PciCcdfErr { - uint32_t reserved1; - uint32_t fh; - uint32_t fid; - uint32_t e; - uint64_t faddr; - uint32_t reserved3; - uint16_t reserved4; - uint16_t pec; -} QEMU_PACKED PciCcdfErr; - -typedef struct PciCcdfAvail { - uint32_t reserved1; - uint32_t fh; - uint32_t fid; - uint32_t reserved2; - uint32_t reserved3; - uint32_t reserved4; - uint32_t reserved5; - uint16_t reserved6; - uint16_t pec; -} QEMU_PACKED PciCcdfAvail; - -typedef struct ChscSeiNt2Res { - uint16_t length; - uint16_t code; - uint16_t reserved1; - uint8_t reserved2; - uint8_t nt; - uint8_t flags; - uint8_t reserved3; - uint8_t reserved4; - uint8_t cc; - uint32_t reserved5[13]; - uint8_t ccdf[4016]; -} QEMU_PACKED ChscSeiNt2Res; - -typedef struct S390MsixInfo { - uint8_t table_bar; - uint8_t pba_bar; - uint16_t entries; - uint32_t table_offset; - uint32_t pba_offset; -} S390MsixInfo; - -typedef struct S390IOTLBEntry { - uint64_t iova; - uint64_t translated_addr; - uint64_t len; - uint64_t perm; -} S390IOTLBEntry; - -struct S390PCIIOMMU { - Object parent_obj; - S390PCIBusDevice *pbdev; - AddressSpace as; - MemoryRegion mr; - IOMMUMemoryRegion iommu_mr; - bool enabled; - uint64_t g_iota; - uint64_t pba; - uint64_t pal; - GHashTable *iotlb; -}; - -typedef struct S390PCIIOMMUTable { - uint64_t key; - S390PCIIOMMU *iommu[PCI_SLOT_MAX]; -} S390PCIIOMMUTable; - -/* Function Measurement Block */ -#define DEFAULT_MUI 4000 -#define UPDATE_U_BIT 0x1ULL -#define FMBK_MASK 0xfULL - -typedef struct ZpciFmbFmt0 { - uint64_t dma_rbytes; - uint64_t dma_wbytes; -} ZpciFmbFmt0; - -#define ZPCI_FMB_CNT_LD 0 -#define ZPCI_FMB_CNT_ST 1 -#define ZPCI_FMB_CNT_STB 2 -#define ZPCI_FMB_CNT_RPCIT 3 -#define ZPCI_FMB_CNT_MAX 4 - -#define ZPCI_FMB_FORMAT 0 - -typedef struct ZpciFmb { - uint32_t format; - uint32_t sample; - uint64_t last_update; - uint64_t counter[ZPCI_FMB_CNT_MAX]; - ZpciFmbFmt0 fmt0; -} ZpciFmb; -QEMU_BUILD_BUG_MSG(offsetof(ZpciFmb, fmt0) != 48, "padding in ZpciFmb"); - -struct S390PCIBusDevice { - DeviceState qdev; - PCIDevice *pdev; - ZpciState state; - char *target; - uint16_t uid; - uint32_t idx; - uint32_t fh; - uint32_t fid; - bool fid_defined; - uint64_t fmb_addr; - ZpciFmb fmb; - QEMUTimer *fmb_timer; - uint8_t isc; - uint16_t noi; - uint16_t maxstbl; - uint8_t sum; - S390MsixInfo msix; - AdapterRoutes routes; - S390PCIIOMMU *iommu; - MemoryRegion msix_notify_mr; - IndAddr *summary_ind; - IndAddr *indicator; - bool pci_unplug_request_processed; - bool unplug_requested; - QTAILQ_ENTRY(S390PCIBusDevice) link; -}; - -struct S390PCIBus { - BusState qbus; -}; - -struct S390pciState { - PCIHostState parent_obj; - uint32_t next_idx; - int bus_no; - S390PCIBus *bus; - GHashTable *iommu_table; - GHashTable *zpci_table; - QTAILQ_HEAD(, SeiContainer) pending_sei; - QTAILQ_HEAD(, S390PCIBusDevice) zpci_devs; -}; - -S390pciState *s390_get_phb(void); -int pci_chsc_sei_nt2_get_event(void *res); -int pci_chsc_sei_nt2_have_event(void); -void s390_pci_sclp_configure(SCCB *sccb); -void s390_pci_sclp_deconfigure(SCCB *sccb); -void s390_pci_iommu_enable(S390PCIIOMMU *iommu); -void s390_pci_iommu_disable(S390PCIIOMMU *iommu); -void s390_pci_generate_error_event(uint16_t pec, uint32_t fh, uint32_t fid, - uint64_t faddr, uint32_t e); -uint16_t s390_guest_io_table_walk(uint64_t g_iota, hwaddr addr, - S390IOTLBEntry *entry); -S390PCIBusDevice *s390_pci_find_dev_by_idx(S390pciState *s, uint32_t idx); -S390PCIBusDevice *s390_pci_find_dev_by_fh(S390pciState *s, uint32_t fh); -S390PCIBusDevice *s390_pci_find_dev_by_fid(S390pciState *s, uint32_t fid); -S390PCIBusDevice *s390_pci_find_dev_by_target(S390pciState *s, - const char *target); -S390PCIBusDevice *s390_pci_find_next_avail_dev(S390pciState *s, - S390PCIBusDevice *pbdev); - -#endif diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 2f7a7d7..639b13c 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -13,12 +13,12 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "s390-pci-inst.h" -#include "s390-pci-bus.h" #include "exec/memop.h" #include "exec/memory-internal.h" #include "qemu/error-report.h" #include "sysemu/hw_accel.h" +#include "hw/s390x/s390-pci-inst.h" +#include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/tod.h" #ifndef DEBUG_S390PCI_INST diff --git a/hw/s390x/s390-pci-inst.h b/hw/s390x/s390-pci-inst.h deleted file mode 100644 index fa3bf8b..0000000 --- a/hw/s390x/s390-pci-inst.h +++ /dev/null @@ -1,312 +0,0 @@ -/* - * s390 PCI instruction definitions - * - * Copyright 2014 IBM Corp. - * Author(s): Frank Blaschka - * Hong Bo Li - * Yi Min Zhao - * - * This work is licensed under the terms of the GNU GPL, version 2 or (at - * your option) any later version. See the COPYING file in the top-level - * directory. - */ - -#ifndef HW_S390_PCI_INST_H -#define HW_S390_PCI_INST_H - -#include "s390-pci-bus.h" -#include "sysemu/dma.h" - -/* CLP common request & response block size */ -#define CLP_BLK_SIZE 4096 -#define PCI_BAR_COUNT 6 -#define PCI_MAX_FUNCTIONS 4096 - -typedef struct ClpReqHdr { - uint16_t len; - uint16_t cmd; -} QEMU_PACKED ClpReqHdr; - -typedef struct ClpRspHdr { - uint16_t len; - uint16_t rsp; -} QEMU_PACKED ClpRspHdr; - -/* CLP Response Codes */ -#define CLP_RC_OK 0x0010 /* Command request successfully */ -#define CLP_RC_CMD 0x0020 /* Command code not recognized */ -#define CLP_RC_PERM 0x0030 /* Command not authorized */ -#define CLP_RC_FMT 0x0040 /* Invalid command request format */ -#define CLP_RC_LEN 0x0050 /* Invalid command request length */ -#define CLP_RC_8K 0x0060 /* Command requires 8K LPCB */ -#define CLP_RC_RESNOT0 0x0070 /* Reserved field not zero */ -#define CLP_RC_NODATA 0x0080 /* No data available */ -#define CLP_RC_FC_UNKNOWN 0x0100 /* Function code not recognized */ - -/* - * Call Logical Processor - Command Codes - */ -#define CLP_LIST_PCI 0x0002 -#define CLP_QUERY_PCI_FN 0x0003 -#define CLP_QUERY_PCI_FNGRP 0x0004 -#define CLP_SET_PCI_FN 0x0005 - -/* PCI function handle list entry */ -typedef struct ClpFhListEntry { - uint16_t device_id; - uint16_t vendor_id; -#define CLP_FHLIST_MASK_CONFIG 0x80000000 - uint32_t config; - uint32_t fid; - uint32_t fh; -} QEMU_PACKED ClpFhListEntry; - -#define CLP_RC_SETPCIFN_FH 0x0101 /* Invalid PCI fn handle */ -#define CLP_RC_SETPCIFN_FHOP 0x0102 /* Fn handle not valid for op */ -#define CLP_RC_SETPCIFN_DMAAS 0x0103 /* Invalid DMA addr space */ -#define CLP_RC_SETPCIFN_RES 0x0104 /* Insufficient resources */ -#define CLP_RC_SETPCIFN_ALRDY 0x0105 /* Fn already in requested state */ -#define CLP_RC_SETPCIFN_ERR 0x0106 /* Fn in permanent error state */ -#define CLP_RC_SETPCIFN_RECPND 0x0107 /* Error recovery pending */ -#define CLP_RC_SETPCIFN_BUSY 0x0108 /* Fn busy */ -#define CLP_RC_LISTPCI_BADRT 0x010a /* Resume token not recognized */ -#define CLP_RC_QUERYPCIFG_PFGID 0x010b /* Unrecognized PFGID */ - -/* request or response block header length */ -#define LIST_PCI_HDR_LEN 32 - -/* Number of function handles fitting in response block */ -#define CLP_FH_LIST_NR_ENTRIES \ - ((CLP_BLK_SIZE - 2 * LIST_PCI_HDR_LEN) \ - / sizeof(ClpFhListEntry)) - -#define CLP_SET_ENABLE_PCI_FN 0 /* Yes, 0 enables it */ -#define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */ - -#define CLP_UTIL_STR_LEN 64 - -#define CLP_MASK_FMT 0xf0000000 - -/* List PCI functions request */ -typedef struct ClpReqListPci { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint64_t resume_token; - uint64_t reserved2; -} QEMU_PACKED ClpReqListPci; - -/* List PCI functions response */ -typedef struct ClpRspListPci { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint64_t resume_token; - uint32_t mdd; - uint16_t max_fn; - uint8_t flags; - uint8_t entry_size; - ClpFhListEntry fh_list[CLP_FH_LIST_NR_ENTRIES]; -} QEMU_PACKED ClpRspListPci; - -/* Query PCI function request */ -typedef struct ClpReqQueryPci { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint32_t fh; /* function handle */ - uint32_t reserved2; - uint64_t reserved3; -} QEMU_PACKED ClpReqQueryPci; - -/* Query PCI function response */ -typedef struct ClpRspQueryPci { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint16_t vfn; /* virtual fn number */ -#define CLP_RSP_QPCI_MASK_UTIL 0x100 -#define CLP_RSP_QPCI_MASK_PFGID 0xff - uint16_t ug; - uint32_t fid; /* pci function id */ - uint8_t bar_size[PCI_BAR_COUNT]; - uint16_t pchid; - uint32_t bar[PCI_BAR_COUNT]; - uint64_t reserved2; - uint64_t sdma; /* start dma as */ - uint64_t edma; /* end dma as */ - uint32_t reserved3[11]; - uint32_t uid; - uint8_t util_str[CLP_UTIL_STR_LEN]; /* utility string */ -} QEMU_PACKED ClpRspQueryPci; - -/* Query PCI function group request */ -typedef struct ClpReqQueryPciGrp { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; -#define CLP_REQ_QPCIG_MASK_PFGID 0xff - uint32_t g; - uint32_t reserved2; - uint64_t reserved3; -} QEMU_PACKED ClpReqQueryPciGrp; - -/* Query PCI function group response */ -typedef struct ClpRspQueryPciGrp { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; -#define CLP_RSP_QPCIG_MASK_NOI 0xfff - uint16_t i; - uint8_t version; -#define CLP_RSP_QPCIG_MASK_FRAME 0x2 -#define CLP_RSP_QPCIG_MASK_REFRESH 0x1 - uint8_t fr; - uint16_t maxstbl; - uint16_t mui; - uint64_t reserved3; - uint64_t dasm; /* dma address space mask */ - uint64_t msia; /* MSI address */ - uint64_t reserved4; - uint64_t reserved5; -} QEMU_PACKED ClpRspQueryPciGrp; - -/* Set PCI function request */ -typedef struct ClpReqSetPci { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint32_t fh; /* function handle */ - uint16_t reserved2; - uint8_t oc; /* operation controls */ - uint8_t ndas; /* number of dma spaces */ - uint64_t reserved3; -} QEMU_PACKED ClpReqSetPci; - -/* Set PCI function response */ -typedef struct ClpRspSetPci { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint32_t fh; /* function handle */ - uint32_t reserved3; - uint64_t reserved4; -} QEMU_PACKED ClpRspSetPci; - -typedef struct ClpReqRspListPci { - ClpReqListPci request; - ClpRspListPci response; -} QEMU_PACKED ClpReqRspListPci; - -typedef struct ClpReqRspSetPci { - ClpReqSetPci request; - ClpRspSetPci response; -} QEMU_PACKED ClpReqRspSetPci; - -typedef struct ClpReqRspQueryPci { - ClpReqQueryPci request; - ClpRspQueryPci response; -} QEMU_PACKED ClpReqRspQueryPci; - -typedef struct ClpReqRspQueryPciGrp { - ClpReqQueryPciGrp request; - ClpRspQueryPciGrp response; -} QEMU_PACKED ClpReqRspQueryPciGrp; - -/* Load/Store status codes */ -#define ZPCI_PCI_ST_FUNC_NOT_ENABLED 4 -#define ZPCI_PCI_ST_FUNC_IN_ERR 8 -#define ZPCI_PCI_ST_BLOCKED 12 -#define ZPCI_PCI_ST_INSUF_RES 16 -#define ZPCI_PCI_ST_INVAL_AS 20 -#define ZPCI_PCI_ST_FUNC_ALREADY_ENABLED 24 -#define ZPCI_PCI_ST_DMA_AS_NOT_ENABLED 28 -#define ZPCI_PCI_ST_2ND_OP_IN_INV_AS 36 -#define ZPCI_PCI_ST_FUNC_NOT_AVAIL 40 -#define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE 44 - -/* Load/Store return codes */ -#define ZPCI_PCI_LS_OK 0 -#define ZPCI_PCI_LS_ERR 1 -#define ZPCI_PCI_LS_BUSY 2 -#define ZPCI_PCI_LS_INVAL_HANDLE 3 - -/* Modify PCI status codes */ -#define ZPCI_MOD_ST_RES_NOT_AVAIL 4 -#define ZPCI_MOD_ST_INSUF_RES 16 -#define ZPCI_MOD_ST_SEQUENCE 24 -#define ZPCI_MOD_ST_DMAAS_INVAL 28 -#define ZPCI_MOD_ST_FRAME_INVAL 32 -#define ZPCI_MOD_ST_ERROR_RECOVER 40 - -/* Modify PCI Function Controls */ -#define ZPCI_MOD_FC_REG_INT 2 -#define ZPCI_MOD_FC_DEREG_INT 3 -#define ZPCI_MOD_FC_REG_IOAT 4 -#define ZPCI_MOD_FC_DEREG_IOAT 5 -#define ZPCI_MOD_FC_REREG_IOAT 6 -#define ZPCI_MOD_FC_RESET_ERROR 7 -#define ZPCI_MOD_FC_RESET_BLOCK 9 -#define ZPCI_MOD_FC_SET_MEASURE 10 - -/* Store PCI Function Controls status codes */ -#define ZPCI_STPCIFC_ST_PERM_ERROR 8 -#define ZPCI_STPCIFC_ST_INVAL_DMAAS 28 -#define ZPCI_STPCIFC_ST_ERROR_RECOVER 40 - -/* FIB function controls */ -#define ZPCI_FIB_FC_ENABLED 0x80 -#define ZPCI_FIB_FC_ERROR 0x40 -#define ZPCI_FIB_FC_LS_BLOCKED 0x20 -#define ZPCI_FIB_FC_DMAAS_REG 0x10 - -/* FIB function controls */ -#define ZPCI_FIB_FC_ENABLED 0x80 -#define ZPCI_FIB_FC_ERROR 0x40 -#define ZPCI_FIB_FC_LS_BLOCKED 0x20 -#define ZPCI_FIB_FC_DMAAS_REG 0x10 - -/* Function Information Block */ -typedef struct ZpciFib { - uint8_t fmt; /* format */ - uint8_t reserved1[7]; - uint8_t fc; /* function controls */ - uint8_t reserved2; - uint16_t reserved3; - uint32_t reserved4; - uint64_t pba; /* PCI base address */ - uint64_t pal; /* PCI address limit */ - uint64_t iota; /* I/O Translation Anchor */ -#define FIB_DATA_ISC(x) (((x) >> 28) & 0x7) -#define FIB_DATA_NOI(x) (((x) >> 16) & 0xfff) -#define FIB_DATA_AIBVO(x) (((x) >> 8) & 0x3f) -#define FIB_DATA_SUM(x) (((x) >> 7) & 0x1) -#define FIB_DATA_AISBO(x) ((x) & 0x3f) - uint32_t data; - uint32_t reserved5; - uint64_t aibv; /* Adapter int bit vector address */ - uint64_t aisb; /* Adapter int summary bit address */ - uint64_t fmb_addr; /* Function measurement address and key */ - uint32_t reserved6; - uint32_t gd; -} QEMU_PACKED ZpciFib; - -int pci_dereg_irqs(S390PCIBusDevice *pbdev); -void pci_dereg_ioat(S390PCIIOMMU *iommu); -int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra); -int pcilg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra); -int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra); -int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra); -int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr, - uint8_t ar, uintptr_t ra); -int mpcifc_service_call(S390CPU *cpu, uint8_t r1, uint64_t fiba, uint8_t ar, - uintptr_t ra); -int stpcifc_service_call(S390CPU *cpu, uint8_t r1, uint64_t fiba, uint8_t ar, - uintptr_t ra); -void fmb_timer_free(S390PCIBusDevice *pbdev); - -#define ZPCI_IO_BAR_MIN 0 -#define ZPCI_IO_BAR_MAX 5 -#define ZPCI_CONFIG_BAR 15 - -#endif diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 2e90033..22222c4 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -28,7 +28,7 @@ #include "qemu/error-report.h" #include "qemu/option.h" #include "qemu/qemu-print.h" -#include "s390-pci-bus.h" +#include "hw/s390x/s390-pci-bus.h" #include "sysemu/reset.h" #include "hw/s390x/storage-keys.h" #include "hw/s390x/storage-attributes.h" -- cgit v1.1 From 3ab7a0b40d4be5ade3b61d4afd1518193b199423 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:32 -0400 Subject: vfio: Create shared routine for scanning info capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than duplicating the same loop in multiple locations, create a static function to do the work. Signed-off-by: Matthew Rosato Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/common.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d41ba67..693d3a2 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1125,17 +1125,12 @@ static void vfio_listener_release(VFIOContainer *container) } } -struct vfio_info_cap_header * -vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) +static struct vfio_info_cap_header * +vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id) { struct vfio_info_cap_header *hdr; - void *ptr = info; - if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { - return NULL; - } - - for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) { if (hdr->id == id) { return hdr; } @@ -1144,6 +1139,16 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) return NULL; } +struct vfio_info_cap_header * +vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) +{ + if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { + return NULL; + } + + return vfio_get_cap((void *)info, info->cap_offset, id); +} + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, struct vfio_region_info *info) { -- cgit v1.1 From 7486a62845b1e12011dd99973e4739f69d57cd38 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:33 -0400 Subject: vfio: Find DMA available capability The underlying host may be limiting the number of outstanding DMA requests for type 1 IOMMU. Add helper functions to check for the DMA available capability and retrieve the current number of DMA mappings allowed. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck [aw: vfio_get_info_dma_avail moved inside CONFIG_LINUX] Signed-off-by: Alex Williamson --- hw/vfio/common.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 693d3a2..920786a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1149,6 +1149,37 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) return vfio_get_cap((void *)info, info->cap_offset, id); } +static struct vfio_info_cap_header * +vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) +{ + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { + return NULL; + } + + return vfio_get_cap((void *)info, info->cap_offset, id); +} + +bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + unsigned int *avail) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_dma_avail *cap; + + /* If the capability cannot be found, assume no DMA limiting */ + hdr = vfio_get_iommu_type1_info_cap(info, + VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); + if (hdr == NULL) { + return false; + } + + if (avail != NULL) { + cap = (void *) hdr; + *avail = cap->avail; + } + + return true; +} + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, struct vfio_region_info *info) { -- cgit v1.1 From cd7498d07fbb20fa04790ff7ee168a8a8d01cb30 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:34 -0400 Subject: s390x/pci: Add routine to get the vfio dma available count Create new files for separating out vfio-specific work for s390 pci. Add the first such routine, which issues VFIO_IOMMU_GET_INFO ioctl to collect the current dma available count. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck [aw: Fix non-Linux build with CONFIG_LINUX] Signed-off-by: Alex Williamson --- hw/s390x/meson.build | 1 + hw/s390x/s390-pci-vfio.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 hw/s390x/s390-pci-vfio.c (limited to 'hw') diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build index 948ceae..f4663a8 100644 --- a/hw/s390x/meson.build +++ b/hw/s390x/meson.build @@ -27,6 +27,7 @@ s390x_ss.add(when: 'CONFIG_KVM', if_true: files( )) s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files('s390-virtio-ccw.c')) s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c')) +s390x_ss.add(when: 'CONFIG_LINUX', if_true: files('s390-pci-vfio.c')) virtio_ss = ss.source_set() virtio_ss.add(files('virtio-ccw.c')) diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c new file mode 100644 index 0000000..cb3f4d9 --- /dev/null +++ b/hw/s390x/s390-pci-vfio.c @@ -0,0 +1,54 @@ +/* + * s390 vfio-pci interfaces + * + * Copyright 2020 IBM Corp. + * Author(s): Matthew Rosato + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include + +#include "qemu/osdep.h" +#include "hw/s390x/s390-pci-vfio.h" +#include "hw/vfio/vfio-common.h" + +/* + * Get the current DMA available count from vfio. Returns true if vfio is + * limiting DMA requests, false otherwise. The current available count read + * from vfio is returned in avail. + */ +bool s390_pci_update_dma_avail(int fd, unsigned int *avail) +{ + g_autofree struct vfio_iommu_type1_info *info; + uint32_t argsz; + + assert(avail); + + argsz = sizeof(struct vfio_iommu_type1_info); + info = g_malloc0(argsz); + + /* + * If the specified argsz is not large enough to contain all capabilities + * it will be updated upon return from the ioctl. Retry until we have + * a big enough buffer to hold the entire capability chain. + */ +retry: + info->argsz = argsz; + + if (ioctl(fd, VFIO_IOMMU_GET_INFO, info)) { + return false; + } + + if (info->argsz > argsz) { + argsz = info->argsz; + info = g_realloc(info, argsz); + goto retry; + } + + /* If the capability exists, update with the current value */ + return vfio_get_info_dma_avail(info, avail); +} + -- cgit v1.1 From 37fa32de707340f3a93959ad5a1ebc41ba1520ee Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:35 -0400 Subject: s390x/pci: Honor DMA limits set by vfio When an s390 guest is using lazy unmapping, it can result in a very large number of oustanding DMA requests, far beyond the default limit configured for vfio. Let's track DMA usage similar to vfio in the host, and trigger the guest to flush their DMA mappings before vfio runs out. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck [aw: non-Linux build fixes] Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 16 +++++++++++----- hw/s390x/s390-pci-inst.c | 45 +++++++++++++++++++++++++++++++++++++++------ hw/s390x/s390-pci-vfio.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 11 deletions(-) (limited to 'hw') diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index a929340..2187173 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -17,6 +17,7 @@ #include "cpu.h" #include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/s390-pci-inst.h" +#include "hw/s390x/s390-pci-vfio.h" #include "hw/pci/pci_bus.h" #include "hw/qdev-properties.h" #include "hw/pci/pci_bridge.h" @@ -764,6 +765,7 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) s->bus_no = 0; QTAILQ_INIT(&s->pending_sei); QTAILQ_INIT(&s->zpci_devs); + QTAILQ_INIT(&s->zpci_dma_limit); css_register_io_adapters(CSS_IO_ADAPTER_PCI, true, false, S390_ADAPTER_SUPPRESSIBLE, errp); @@ -941,17 +943,18 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, } } + pbdev->pdev = pdev; + pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); + pbdev->iommu->pbdev = pbdev; + pbdev->state = ZPCI_FS_DISABLED; + if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { pbdev->fh |= FH_SHM_VFIO; + pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); } else { pbdev->fh |= FH_SHM_EMUL; } - pbdev->pdev = pdev; - pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); - pbdev->iommu->pbdev = pbdev; - pbdev->state = ZPCI_FS_DISABLED; - if (s390_pci_msix_init(pbdev)) { error_setg(errp, "MSI-X support is mandatory " "in the S390 architecture"); @@ -1004,6 +1007,9 @@ static void s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, pbdev->fid = 0; QTAILQ_REMOVE(&s->zpci_devs, pbdev, link); g_hash_table_remove(s->zpci_table, &pbdev->idx); + if (pbdev->iommu->dma_limit) { + s390_pci_end_dma_count(s, pbdev->iommu->dma_limit); + } qdev_unrealize(dev); } } diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 639b13c..4eadd9e 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -32,6 +32,20 @@ } \ } while (0) +static inline void inc_dma_avail(S390PCIIOMMU *iommu) +{ + if (iommu->dma_limit) { + iommu->dma_limit->avail++; + } +} + +static inline void dec_dma_avail(S390PCIIOMMU *iommu) +{ + if (iommu->dma_limit) { + iommu->dma_limit->avail--; + } +} + static void s390_set_status_code(CPUS390XState *env, uint8_t r, uint64_t status_code) { @@ -572,7 +586,8 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) return 0; } -static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) +static uint32_t s390_pci_update_iotlb(S390PCIIOMMU *iommu, + S390IOTLBEntry *entry) { S390IOTLBEntry *cache = g_hash_table_lookup(iommu->iotlb, &entry->iova); IOMMUTLBEntry notify = { @@ -585,14 +600,15 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) if (entry->perm == IOMMU_NONE) { if (!cache) { - return; + goto out; } g_hash_table_remove(iommu->iotlb, &entry->iova); + inc_dma_avail(iommu); } else { if (cache) { if (cache->perm == entry->perm && cache->translated_addr == entry->translated_addr) { - return; + goto out; } notify.perm = IOMMU_NONE; @@ -606,9 +622,13 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) cache->len = PAGE_SIZE; cache->perm = entry->perm; g_hash_table_replace(iommu->iotlb, &cache->iova, cache); + dec_dma_avail(iommu); } memory_region_notify_iommu(&iommu->iommu_mr, 0, notify); + +out: + return iommu->dma_limit ? iommu->dma_limit->avail : 1; } int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) @@ -620,6 +640,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) S390PCIIOMMU *iommu; S390IOTLBEntry entry; hwaddr start, end; + uint32_t dma_avail; if (env->psw.mask & PSW_MASK_PSTATE) { s390_program_interrupt(env, PGM_PRIVILEGED, ra); @@ -658,6 +679,11 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) } iommu = pbdev->iommu; + if (iommu->dma_limit) { + dma_avail = iommu->dma_limit->avail; + } else { + dma_avail = 1; + } if (!iommu->g_iota) { error = ERR_EVENT_INVALAS; goto err; @@ -675,8 +701,9 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) } start += entry.len; - while (entry.iova < start && entry.iova < end) { - s390_pci_update_iotlb(iommu, &entry); + while (entry.iova < start && entry.iova < end && + (dma_avail > 0 || entry.perm == IOMMU_NONE)) { + dma_avail = s390_pci_update_iotlb(iommu, &entry); entry.iova += PAGE_SIZE; entry.translated_addr += PAGE_SIZE; } @@ -689,7 +716,13 @@ err: s390_pci_generate_error_event(error, pbdev->fh, pbdev->fid, start, 0); } else { pbdev->fmb.counter[ZPCI_FMB_CNT_RPCIT]++; - setcc(cpu, ZPCI_PCI_LS_OK); + if (dma_avail > 0) { + setcc(cpu, ZPCI_PCI_LS_OK); + } else { + /* vfio DMA mappings are exhausted, trigger a RPCIT */ + setcc(cpu, ZPCI_PCI_LS_ERR); + s390_set_status_code(env, r1, ZPCI_RPCIT_ST_INSUFF_RES); + } } return 0; } diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c index cb3f4d9..0621fa3 100644 --- a/hw/s390x/s390-pci-vfio.c +++ b/hw/s390x/s390-pci-vfio.c @@ -12,7 +12,9 @@ #include #include "qemu/osdep.h" +#include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/s390-pci-vfio.h" +#include "hw/vfio/pci.h" #include "hw/vfio/vfio-common.h" /* @@ -52,3 +54,43 @@ retry: return vfio_get_info_dma_avail(info, avail); } +S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, + S390PCIBusDevice *pbdev) +{ + S390PCIDMACount *cnt; + uint32_t avail; + VFIOPCIDevice *vpdev = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + int id; + + assert(vpdev); + + id = vpdev->vbasedev.group->container->fd; + + if (!s390_pci_update_dma_avail(id, &avail)) { + return NULL; + } + + QTAILQ_FOREACH(cnt, &s->zpci_dma_limit, link) { + if (cnt->id == id) { + cnt->users++; + return cnt; + } + } + + cnt = g_new0(S390PCIDMACount, 1); + cnt->id = id; + cnt->users = 1; + cnt->avail = avail; + QTAILQ_INSERT_TAIL(&s->zpci_dma_limit, cnt, link); + return cnt; +} + +void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) +{ + assert(cnt); + + cnt->users--; + if (cnt->users == 0) { + QTAILQ_REMOVE(&s->zpci_dma_limit, cnt, link); + } +} -- cgit v1.1 From 28dc86a07299fba784ca2352f95e30fe603e17ab Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Mon, 26 Oct 2020 11:34:37 -0400 Subject: s390x/pci: use a PCI Group structure We use a S390PCIGroup structure to hold the information related to a zPCI Function group. This allows us to be ready to support multiple groups and to retrieve the group information from the host. Signed-off-by: Pierre Morel Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 42 ++++++++++++++++++++++++++++++++++++++++++ hw/s390x/s390-pci-inst.c | 23 ++++++++++++++--------- 2 files changed, 56 insertions(+), 9 deletions(-) (limited to 'hw') diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 2187173..4c7f06d 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -738,6 +738,46 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn) object_unref(OBJECT(iommu)); } +static S390PCIGroup *s390_group_create(int id) +{ + S390PCIGroup *group; + S390pciState *s = s390_get_phb(); + + group = g_new0(S390PCIGroup, 1); + group->id = id; + QTAILQ_INSERT_TAIL(&s->zpci_groups, group, link); + return group; +} + +S390PCIGroup *s390_group_find(int id) +{ + S390PCIGroup *group; + S390pciState *s = s390_get_phb(); + + QTAILQ_FOREACH(group, &s->zpci_groups, link) { + if (group->id == id) { + return group; + } + } + return NULL; +} + +static void s390_pci_init_default_group(void) +{ + S390PCIGroup *group; + ClpRspQueryPciGrp *resgrp; + + group = s390_group_create(ZPCI_DEFAULT_FN_GRP); + resgrp = &group->zpci_group; + resgrp->fr = 1; + stq_p(&resgrp->dasm, 0); + stq_p(&resgrp->msia, ZPCI_MSI_ADDR); + stw_p(&resgrp->mui, DEFAULT_MUI); + stw_p(&resgrp->i, 128); + stw_p(&resgrp->maxstbl, 128); + resgrp->version = 0; +} + static void s390_pcihost_realize(DeviceState *dev, Error **errp) { PCIBus *b; @@ -766,7 +806,9 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) QTAILQ_INIT(&s->pending_sei); QTAILQ_INIT(&s->zpci_devs); QTAILQ_INIT(&s->zpci_dma_limit); + QTAILQ_INIT(&s->zpci_groups); + s390_pci_init_default_group(); css_register_io_adapters(CSS_IO_ADAPTER_PCI, true, false, S390_ADAPTER_SUPPRESSIBLE, errp); } diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 4eadd9e..c25b2a6 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -298,21 +298,25 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) stq_p(&resquery->edma, ZPCI_EDMA_ADDR); stl_p(&resquery->fid, pbdev->fid); stw_p(&resquery->pchid, 0); - stw_p(&resquery->ug, 1); + stw_p(&resquery->ug, ZPCI_DEFAULT_FN_GRP); stl_p(&resquery->uid, pbdev->uid); stw_p(&resquery->hdr.rsp, CLP_RC_OK); break; } case CLP_QUERY_PCI_FNGRP: { ClpRspQueryPciGrp *resgrp = (ClpRspQueryPciGrp *)resh; - resgrp->fr = 1; - stq_p(&resgrp->dasm, 0); - stq_p(&resgrp->msia, ZPCI_MSI_ADDR); - stw_p(&resgrp->mui, DEFAULT_MUI); - stw_p(&resgrp->i, 128); - stw_p(&resgrp->maxstbl, 128); - resgrp->version = 0; + ClpReqQueryPciGrp *reqgrp = (ClpReqQueryPciGrp *)reqh; + S390PCIGroup *group; + + group = s390_group_find(reqgrp->g); + if (!group) { + /* We do not allow access to unknown groups */ + /* The group must have been obtained with a vfio device */ + stw_p(&resgrp->hdr.rsp, CLP_RC_QUERYPCIFG_PFGID); + goto out; + } + memcpy(resgrp, &group->zpci_group, sizeof(ClpRspQueryPciGrp)); stw_p(&resgrp->hdr.rsp, CLP_RC_OK); break; } @@ -787,7 +791,8 @@ int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr, } /* Length must be greater than 8, a multiple of 8 */ /* and not greater than maxstbl */ - if ((len <= 8) || (len % 8) || (len > pbdev->maxstbl)) { + if ((len <= 8) || (len % 8) || + (len > pbdev->pci_group->zpci_group.maxstbl)) { goto specification_error; } /* Do not cross a 4K-byte boundary */ -- cgit v1.1 From b354d5d8049c513444b51ce841bd3136fed2e234 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:38 -0400 Subject: s390x/pci: clean up s390 PCI groups Add a step to remove all stashed PCI groups to avoid stale data between machine resets. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'hw') diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 4c7f06d..036cf46 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -813,6 +813,17 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) S390_ADAPTER_SUPPRESSIBLE, errp); } +static void s390_pcihost_unrealize(DeviceState *dev) +{ + S390PCIGroup *group; + S390pciState *s = S390_PCI_HOST_BRIDGE(dev); + + while (!QTAILQ_EMPTY(&s->zpci_groups)) { + group = QTAILQ_FIRST(&s->zpci_groups); + QTAILQ_REMOVE(&s->zpci_groups, group, link); + } +} + static int s390_pci_msix_init(S390PCIBusDevice *pbdev) { char *name; @@ -1171,6 +1182,7 @@ static void s390_pcihost_class_init(ObjectClass *klass, void *data) dc->reset = s390_pcihost_reset; dc->realize = s390_pcihost_realize; + dc->unrealize = s390_pcihost_unrealize; hc->pre_plug = s390_pcihost_pre_plug; hc->plug = s390_pcihost_plug; hc->unplug_request = s390_pcihost_unplug_request; -- cgit v1.1 From 9670ee752727945d8ce4f76efc0b68364b832f20 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Mon, 26 Oct 2020 11:34:39 -0400 Subject: s390x/pci: use a PCI Function structure We use a ClpRspQueryPci structure to hold the information related to a zPCI Function. This allows us to be ready to support different zPCI functions and to retrieve the zPCI function information from the host. Signed-off-by: Pierre Morel Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 12 ++++++++++++ hw/s390x/s390-pci-inst.c | 8 ++------ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'hw') diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 036cf46..072b56e 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -778,6 +778,17 @@ static void s390_pci_init_default_group(void) resgrp->version = 0; } +static void set_pbdev_info(S390PCIBusDevice *pbdev) +{ + pbdev->zpci_fn.sdma = ZPCI_SDMA_ADDR; + pbdev->zpci_fn.edma = ZPCI_EDMA_ADDR; + pbdev->zpci_fn.pchid = 0; + pbdev->zpci_fn.ug = ZPCI_DEFAULT_FN_GRP; + pbdev->zpci_fn.fid = pbdev->fid; + pbdev->zpci_fn.uid = pbdev->uid; + pbdev->pci_group = s390_group_find(ZPCI_DEFAULT_FN_GRP); +} + static void s390_pcihost_realize(DeviceState *dev, Error **errp) { PCIBus *b; @@ -1000,6 +1011,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); pbdev->iommu->pbdev = pbdev; pbdev->state = ZPCI_FS_DISABLED; + set_pbdev_info(pbdev); if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { pbdev->fh |= FH_SHM_VFIO; diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index c25b2a6..58cd041 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -281,6 +281,8 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) goto out; } + memcpy(resquery, &pbdev->zpci_fn, sizeof(*resquery)); + for (i = 0; i < PCI_BAR_COUNT; i++) { uint32_t data = pci_get_long(pbdev->pdev->config + PCI_BASE_ADDRESS_0 + (i * 4)); @@ -294,12 +296,6 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) resquery->bar_size[i]); } - stq_p(&resquery->sdma, ZPCI_SDMA_ADDR); - stq_p(&resquery->edma, ZPCI_EDMA_ADDR); - stl_p(&resquery->fid, pbdev->fid); - stw_p(&resquery->pchid, 0); - stw_p(&resquery->ug, ZPCI_DEFAULT_FN_GRP); - stl_p(&resquery->uid, pbdev->uid); stw_p(&resquery->hdr.rsp, CLP_RC_OK); break; } -- cgit v1.1 From 92fe289ace3e559e2d18d0c2e49cdfb4cbd5a59b Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:40 -0400 Subject: vfio: Add routine for finding VFIO_DEVICE_GET_INFO capabilities Now that VFIO_DEVICE_GET_INFO supports capability chains, add a helper function to find specific capabilities in the chain. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/common.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 920786a..57f55f0 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1159,6 +1159,16 @@ vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) return vfio_get_cap((void *)info, info->cap_offset, id); } +struct vfio_info_cap_header * +vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id) +{ + if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) { + return NULL; + } + + return vfio_get_cap((void *)info, info->cap_offset, id); +} + bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, unsigned int *avail) { -- cgit v1.1 From 1e7552ff5c34972a7a17d2b06900a0b66c79a68b Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:41 -0400 Subject: s390x/pci: get zPCI function info from host We use the capability chains of the VFIO_DEVICE_GET_INFO ioctl to retrieve the CLP information that the kernel exports. To be compatible with previous kernel versions we fall back on previous predefined values, same as the emulation values, when the ioctl is found to not support capability chains. If individual CLP capabilities are not found, we fall back on default values for only those capabilities missing from the chain. This patch is based on work previously done by Pierre Morel. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck [aw: non-Linux build fixes] Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 9 ++- hw/s390x/s390-pci-vfio.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++ hw/s390x/trace-events | 5 ++ 3 files changed, 191 insertions(+), 3 deletions(-) (limited to 'hw') diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 072b56e..48a3be8 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -738,7 +738,7 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn) object_unref(OBJECT(iommu)); } -static S390PCIGroup *s390_group_create(int id) +S390PCIGroup *s390_group_create(int id) { S390PCIGroup *group; S390pciState *s = s390_get_phb(); @@ -783,7 +783,7 @@ static void set_pbdev_info(S390PCIBusDevice *pbdev) pbdev->zpci_fn.sdma = ZPCI_SDMA_ADDR; pbdev->zpci_fn.edma = ZPCI_EDMA_ADDR; pbdev->zpci_fn.pchid = 0; - pbdev->zpci_fn.ug = ZPCI_DEFAULT_FN_GRP; + pbdev->zpci_fn.pfgid = ZPCI_DEFAULT_FN_GRP; pbdev->zpci_fn.fid = pbdev->fid; pbdev->zpci_fn.uid = pbdev->uid; pbdev->pci_group = s390_group_find(ZPCI_DEFAULT_FN_GRP); @@ -863,7 +863,8 @@ static int s390_pci_msix_init(S390PCIBusDevice *pbdev) name = g_strdup_printf("msix-s390-%04x", pbdev->uid); memory_region_init_io(&pbdev->msix_notify_mr, OBJECT(pbdev), &s390_msi_ctrl_ops, pbdev, name, PAGE_SIZE); - memory_region_add_subregion(&pbdev->iommu->mr, ZPCI_MSI_ADDR, + memory_region_add_subregion(&pbdev->iommu->mr, + pbdev->pci_group->zpci_group.msia, &pbdev->msix_notify_mr); g_free(name); @@ -1016,6 +1017,8 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { pbdev->fh |= FH_SHM_VFIO; pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); + /* Fill in CLP information passed via the vfio region */ + s390_pci_get_clp_info(pbdev); } else { pbdev->fh |= FH_SHM_EMUL; } diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c index 0621fa3..d5c7806 100644 --- a/hw/s390x/s390-pci-vfio.c +++ b/hw/s390x/s390-pci-vfio.c @@ -10,9 +10,13 @@ */ #include +#include +#include #include "qemu/osdep.h" +#include "trace.h" #include "hw/s390x/s390-pci-bus.h" +#include "hw/s390x/s390-pci-clp.h" #include "hw/s390x/s390-pci-vfio.h" #include "hw/vfio/pci.h" #include "hw/vfio/vfio-common.h" @@ -94,3 +98,179 @@ void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) QTAILQ_REMOVE(&s->zpci_dma_limit, cnt, link); } } + +static void s390_pci_read_base(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_base *cap; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_BASE); + + /* If capability not provided, just leave the defaults in place */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_BASE); + return; + } + cap = (void *) hdr; + + pbdev->zpci_fn.sdma = cap->start_dma; + pbdev->zpci_fn.edma = cap->end_dma; + pbdev->zpci_fn.pchid = cap->pchid; + pbdev->zpci_fn.vfn = cap->vfn; + pbdev->zpci_fn.pfgid = cap->gid; + /* The following values remain 0 until we support other FMB formats */ + pbdev->zpci_fn.fmbl = 0; + pbdev->zpci_fn.pft = 0; +} + +static void s390_pci_read_group(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_group *cap; + ClpRspQueryPciGrp *resgrp; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_GROUP); + + /* If capability not provided, just use the default group */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_GROUP); + pbdev->zpci_fn.pfgid = ZPCI_DEFAULT_FN_GRP; + pbdev->pci_group = s390_group_find(ZPCI_DEFAULT_FN_GRP); + return; + } + cap = (void *) hdr; + + /* See if the PCI group is already defined, create if not */ + pbdev->pci_group = s390_group_find(pbdev->zpci_fn.pfgid); + + if (!pbdev->pci_group) { + pbdev->pci_group = s390_group_create(pbdev->zpci_fn.pfgid); + + resgrp = &pbdev->pci_group->zpci_group; + if (cap->flags & VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH) { + resgrp->fr = 1; + } + stq_p(&resgrp->dasm, cap->dasm); + stq_p(&resgrp->msia, cap->msi_addr); + stw_p(&resgrp->mui, cap->mui); + stw_p(&resgrp->i, cap->noi); + stw_p(&resgrp->maxstbl, cap->maxstbl); + stb_p(&resgrp->version, cap->version); + } +} + +static void s390_pci_read_util(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_util *cap; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_UTIL); + + /* If capability not provided, just leave the defaults in place */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_UTIL); + return; + } + cap = (void *) hdr; + + if (cap->size > CLP_UTIL_STR_LEN) { + trace_s390_pci_clp_cap_size(vpci->vbasedev.name, cap->size, + VFIO_DEVICE_INFO_CAP_ZPCI_UTIL); + return; + } + + pbdev->zpci_fn.flags |= CLP_RSP_QPCI_MASK_UTIL; + memcpy(pbdev->zpci_fn.util_str, cap->util_str, CLP_UTIL_STR_LEN); +} + +static void s390_pci_read_pfip(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_pfip *cap; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_PFIP); + + /* If capability not provided, just leave the defaults in place */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_PFIP); + return; + } + cap = (void *) hdr; + + if (cap->size > CLP_PFIP_NR_SEGMENTS) { + trace_s390_pci_clp_cap_size(vpci->vbasedev.name, cap->size, + VFIO_DEVICE_INFO_CAP_ZPCI_PFIP); + return; + } + + memcpy(pbdev->zpci_fn.pfip, cap->pfip, CLP_PFIP_NR_SEGMENTS); +} + +/* + * This function will issue the VFIO_DEVICE_GET_INFO ioctl and look for + * capabilities that contain information about CLP features provided by the + * underlying host. + * On entry, defaults have already been placed into the guest CLP response + * buffers. On exit, defaults will have been overwritten for any CLP features + * found in the capability chain; defaults will remain for any CLP features not + * found in the chain. + */ +void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) +{ + g_autofree struct vfio_device_info *info; + VFIOPCIDevice *vfio_pci; + uint32_t argsz; + int fd; + + argsz = sizeof(*info); + info = g_malloc0(argsz); + + vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + fd = vfio_pci->vbasedev.fd; + + /* + * If the specified argsz is not large enough to contain all capabilities + * it will be updated upon return from the ioctl. Retry until we have + * a big enough buffer to hold the entire capability chain. On error, + * just exit and rely on CLP defaults. + */ +retry: + info->argsz = argsz; + + if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { + trace_s390_pci_clp_dev_info(vfio_pci->vbasedev.name); + return; + } + + if (info->argsz > argsz) { + argsz = info->argsz; + info = g_realloc(info, argsz); + goto retry; + } + + /* + * Find the CLP features provided and fill in the guest CLP responses. + * Always call s390_pci_read_base first as information from this could + * determine which function group is used in s390_pci_read_group. + * For any feature not found, the default values will remain in the CLP + * response. + */ + s390_pci_read_base(pbdev, info); + s390_pci_read_group(pbdev, info); + s390_pci_read_util(pbdev, info); + s390_pci_read_pfip(pbdev, info); + + return; +} diff --git a/hw/s390x/trace-events b/hw/s390x/trace-events index 0dc5b81..8156693 100644 --- a/hw/s390x/trace-events +++ b/hw/s390x/trace-events @@ -14,3 +14,8 @@ css_do_sic(uint16_t mode, uint8_t isc) "CSS: set interruption mode 0x%x on isc 0 virtio_ccw_interpret_ccw(int cssid, int ssid, int schid, int cmd_code) "VIRTIO-CCW: %x.%x.%04x: interpret command 0x%x" virtio_ccw_new_device(int cssid, int ssid, int schid, int devno, const char *devno_mode) "VIRTIO-CCW: add subchannel %x.%x.%04x, devno 0x%04x (%s)" virtio_ccw_set_ind(uint64_t ind_loc, uint8_t ind_old, uint8_t ind_new) "VIRTIO-CCW: indicator at %" PRIu64 ": 0x%x->0x%x" + +# s390-pci-vfio.c +s390_pci_clp_cap(const char *id, uint32_t cap) "PCI: %s: missing expected CLP capability %u" +s390_pci_clp_cap_size(const char *id, uint32_t size, uint32_t cap) "PCI: %s: bad size (%u) for CLP capability %u" +s390_pci_clp_dev_info(const char *id) "PCI: %s: cannot read vfio device info" -- cgit v1.1 From 88eef59796f91271e3d288f64457e975dd7c8ac9 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Fri, 23 Oct 2020 18:13:42 +0530 Subject: hw/vfio: Use lock guard macros Use qemu LOCK_GUARD macros in hw/vfio. Saves manual unlock calls Signed-off-by: Amey Narkhede Signed-off-by: Alex Williamson --- hw/vfio/platform.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'hw') diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 869ed2c..cc3f66f 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -166,7 +166,7 @@ static void vfio_intp_mmap_enable(void *opaque) VFIOINTp *tmp; VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque; - qemu_mutex_lock(&vdev->intp_mutex); + QEMU_LOCK_GUARD(&vdev->intp_mutex); QLIST_FOREACH(tmp, &vdev->intp_list, next) { if (tmp->state == VFIO_IRQ_ACTIVE) { trace_vfio_platform_intp_mmap_enable(tmp->pin); @@ -174,12 +174,10 @@ static void vfio_intp_mmap_enable(void *opaque) timer_mod(vdev->mmap_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->mmap_timeout); - qemu_mutex_unlock(&vdev->intp_mutex); return; } } vfio_mmap_set_enabled(vdev, true); - qemu_mutex_unlock(&vdev->intp_mutex); } /** @@ -289,7 +287,7 @@ static void vfio_platform_eoi(VFIODevice *vbasedev) VFIOPlatformDevice *vdev = container_of(vbasedev, VFIOPlatformDevice, vbasedev); - qemu_mutex_lock(&vdev->intp_mutex); + QEMU_LOCK_GUARD(&vdev->intp_mutex); QLIST_FOREACH(intp, &vdev->intp_list, next) { if (intp->state == VFIO_IRQ_ACTIVE) { trace_vfio_platform_eoi(intp->pin, @@ -314,7 +312,6 @@ static void vfio_platform_eoi(VFIODevice *vbasedev) vfio_intp_inject_pending_lockheld(intp); QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext); } - qemu_mutex_unlock(&vdev->intp_mutex); } /** -- cgit v1.1 From c624b6b312680b76d2a19a4c65cfdb234e875e1b Mon Sep 17 00:00:00 2001 From: Zhengui li Date: Mon, 19 Oct 2020 14:23:46 +0000 Subject: vfio: fix incorrect print type The type of input variable is unsigned int while the printer type is int. So fix incorrect print type. Signed-off-by: Zhengui li Signed-off-by: Alex Williamson --- hw/vfio/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 57f55f0..e18ea2c 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -205,7 +205,7 @@ void vfio_region_write(void *opaque, hwaddr addr, buf.qword = cpu_to_le64(data); break; default: - hw_error("vfio: unsupported write size, %d bytes", size); + hw_error("vfio: unsupported write size, %u bytes", size); break; } @@ -262,7 +262,7 @@ uint64_t vfio_region_read(void *opaque, data = le64_to_cpu(buf.qword); break; default: - hw_error("vfio: unsupported read size, %d bytes", size); + hw_error("vfio: unsupported read size, %u bytes", size); break; } -- cgit v1.1