diff options
author | Stefan Hajnoczi <stefanha@redhat.com> | 2025-02-22 05:06:39 +0800 |
---|---|---|
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2025-02-22 05:06:39 +0800 |
commit | b69801dd6b1eb4d107f7c2f643adf0a4e3ec9124 (patch) | |
tree | 9620500ad85d8368314501a3969e20624916db17 /hw | |
parent | f41af4c5857b6983766aaffc041580ff170d0679 (diff) | |
parent | dd6d545e8f2d9a0e8a8c287ec16469f03ef5c198 (diff) | |
download | qemu-b69801dd6b1eb4d107f7c2f643adf0a4e3ec9124.zip qemu-b69801dd6b1eb4d107f7c2f643adf0a4e3ec9124.tar.gz qemu-b69801dd6b1eb4d107f7c2f643adf0a4e3ec9124.tar.bz2 |
Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into staging
virtio,pc,pci: features, fixes, cleanups
Features:
SR-IOV emulation for pci
virtio-mem-pci support for s390
interleave support for cxl
big endian support for vdpa svq
new QAPI events for vhost-user
Also vIOMMU reset order fixups are in.
Fixes, cleanups all over the place.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# -----BEGIN PGP SIGNATURE-----
#
# iQFDBAABCAAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAme4b8sPHG1zdEByZWRo
# YXQuY29tAAoJECgfDbjSjVRpHKcIAKPJsVqPdda2dJ7b7FdyRT0Q+uwezXqaGHd4
# 7Lzih1wsxYNkwIAyPtEb76/21qiS7BluqlUCfCB66R9xWjP5/KfvAFj4/r4AEduE
# fxAgYzotNpv55zcRbcflMyvQ42WGiZZHC+o5Lp7vDXUP3pIyHrl0Ydh5WmcD+hwS
# BjXvda58TirQpPJ7rUL+sSfLih17zQkkDcfv5/AgorDy1wK09RBKwMx/gq7wG8yJ
# twy8eBY2CmfmFD7eTM+EKqBD2T0kwLEeLfS/F/tl5Fyg6lAiYgYtCbGLpAmWErsg
# XZvfZmwqL7CNzWexGvPFnnLyqwC33WUP0k0kT88Y5wh3/h98blw=
# =tej8
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 21 Feb 2025 20:21:31 HKT
# gpg: using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg: issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu: (41 commits)
docs/devel/reset: Document reset expectations for DMA and IOMMU
hw/vfio/common: Add a trace point in vfio_reset_handler
hw/arm/smmuv3: Move reset to exit phase
hw/i386/intel-iommu: Migrate to 3-phase reset
hw/virtio/virtio-iommu: Migrate to 3-phase reset
vhost-user-snd: correct the calculation of config_size
net: vhost-user: add QAPI events to report connection state
hw/virtio/virtio-nsm: Respond with correct length
vdpa: Fix endian bugs in shadow virtqueue
MAINTAINERS: add more files to `vhost`
cryptodev/vhost: allocate CryptoDevBackendVhost using g_mem0()
vhost-iova-tree: Update documentation
vhost-iova-tree, svq: Implement GPA->IOVA & partial IOVA->HVA trees
vhost-iova-tree: Implement an IOVA-only tree
amd_iommu: Use correct bitmask to set capability BAR
amd_iommu: Use correct DTE field for interrupt passthrough
hw/virtio: reset virtio balloon stats on machine reset
mem/cxl_type3: support 3, 6, 12 and 16 interleave ways
hw/mem/cxl_type3: Ensure errp is set on realization failure
hw/mem/cxl_type3: Fix special_ops memory leak on msix_init_exclusive_bar() failure
...
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Diffstat (limited to 'hw')
34 files changed, 561 insertions, 268 deletions
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index dd74c2e..8c1b407 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -924,7 +924,12 @@ static void smmu_base_realize(DeviceState *dev, Error **errp) } } -static void smmu_base_reset_hold(Object *obj, ResetType type) +/* + * Make sure the IOMMU is reset in 'exit' phase after + * all outstanding DMA requests have been quiesced during + * the 'enter' or 'hold' reset phases + */ +static void smmu_base_reset_exit(Object *obj, ResetType type) { SMMUState *s = ARM_SMMU(obj); @@ -949,7 +954,7 @@ static void smmu_base_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, smmu_dev_properties); device_class_set_parent_realize(dc, smmu_base_realize, &sbc->parent_realize); - rc->phases.hold = smmu_base_reset_hold; + rc->phases.exit = smmu_base_reset_exit; } static const TypeInfo smmu_base_info = { diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index c0cf5df..b49a59b 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1870,13 +1870,19 @@ static void smmu_init_irq(SMMUv3State *s, SysBusDevice *dev) } } -static void smmu_reset_hold(Object *obj, ResetType type) +/* + * Make sure the IOMMU is reset in 'exit' phase after + * all outstanding DMA requests have been quiesced during + * the 'enter' or 'hold' reset phases + */ +static void smmu_reset_exit(Object *obj, ResetType type) { SMMUv3State *s = ARM_SMMUV3(obj); SMMUv3Class *c = ARM_SMMUV3_GET_CLASS(s); - if (c->parent_phases.hold) { - c->parent_phases.hold(obj, type); + trace_smmu_reset_exit(); + if (c->parent_phases.exit) { + c->parent_phases.exit(obj, type); } smmuv3_init_regs(s); @@ -1999,7 +2005,7 @@ static void smmuv3_class_init(ObjectClass *klass, void *data) SMMUv3Class *c = ARM_SMMUV3_CLASS(klass); dc->vmsd = &vmstate_smmuv3; - resettable_class_set_parent_phases(rc, NULL, smmu_reset_hold, NULL, + resettable_class_set_parent_phases(rc, NULL, NULL, smmu_reset_exit, &c->parent_phases); device_class_set_parent_realize(dc, 
smmu_realize, &c->parent_realize); diff --git a/hw/arm/trace-events b/hw/arm/trace-events index c64ad34..7790db7 100644 --- a/hw/arm/trace-events +++ b/hw/arm/trace-events @@ -56,6 +56,7 @@ smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x" smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" smmuv3_inv_notifiers_iova(const char *name, int asid, int vmid, uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d" +smmu_reset_exit(void) "" # strongarm.c strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d" diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c index cd116c0..4738959 100644 --- a/hw/cxl/cxl-component-utils.c +++ b/hw/cxl/cxl-component-utils.c @@ -243,8 +243,13 @@ static void hdm_init_common(uint32_t *reg_state, uint32_t *write_msk, ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, INTERLEAVE_4K, 1); ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, POISON_ON_ERR_CAP, 0); - ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 3_6_12_WAY, 0); - ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 16_WAY, 0); + if (type == CXL2_TYPE3_DEVICE) { + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 3_6_12_WAY, 1); + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 16_WAY, 1); + } else { + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 3_6_12_WAY, 0); + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 16_WAY, 0); + } ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, UIO, 0); ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, UIO_DECODER_COUNT, 0); diff --git a/hw/cxl/cxl-device-utils.c b/hw/cxl/cxl-device-utils.c index 035d034..52ad1e4 100644 --- a/hw/cxl/cxl-device-utils.c +++ 
b/hw/cxl/cxl-device-utils.c @@ -352,10 +352,8 @@ static void device_reg_init_common(CXLDeviceState *cxl_dstate) } } -static void mailbox_reg_init_common(CXLDeviceState *cxl_dstate) +static void mailbox_reg_init_common(CXLDeviceState *cxl_dstate, int msi_n) { - const uint8_t msi_n = 9; - /* 2048 payload size */ ARRAY_FIELD_DP32(cxl_dstate->mbox_reg_state32, CXL_DEV_MAILBOX_CAP, PAYLOAD_SIZE, CXL_MAILBOX_PAYLOAD_SHIFT); @@ -382,7 +380,7 @@ static void memdev_reg_init_common(CXLDeviceState *cxl_dstate) cxl_dstate->memdev_status = memdev_status_reg; } -void cxl_device_register_init_t3(CXLType3Dev *ct3d) +void cxl_device_register_init_t3(CXLType3Dev *ct3d, int msi_n) { CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate; uint64_t *cap_h = cxl_dstate->caps_reg_state64; @@ -398,7 +396,7 @@ void cxl_device_register_init_t3(CXLType3Dev *ct3d) device_reg_init_common(cxl_dstate); cxl_device_cap_init(cxl_dstate, MAILBOX, 2, CXL_DEV_MAILBOX_VERSION); - mailbox_reg_init_common(cxl_dstate); + mailbox_reg_init_common(cxl_dstate, msi_n); cxl_device_cap_init(cxl_dstate, MEMORY_DEVICE, 0x4000, CXL_MEM_DEV_STATUS_VERSION); @@ -408,7 +406,7 @@ void cxl_device_register_init_t3(CXLType3Dev *ct3d) CXL_MAILBOX_MAX_PAYLOAD_SIZE); } -void cxl_device_register_init_swcci(CSWMBCCIDev *sw) +void cxl_device_register_init_swcci(CSWMBCCIDev *sw, int msi_n) { CXLDeviceState *cxl_dstate = &sw->cxl_dstate; uint64_t *cap_h = cxl_dstate->caps_reg_state64; @@ -423,7 +421,7 @@ void cxl_device_register_init_swcci(CSWMBCCIDev *sw) device_reg_init_common(cxl_dstate); cxl_device_cap_init(cxl_dstate, MAILBOX, 2, 1); - mailbox_reg_init_common(cxl_dstate); + mailbox_reg_init_common(cxl_dstate, msi_n); cxl_device_cap_init(cxl_dstate, MEMORY_DEVICE, 0x4000, 1); memdev_reg_init_common(cxl_dstate); diff --git a/hw/cxl/switch-mailbox-cci.c b/hw/cxl/switch-mailbox-cci.c index 65cdac6..833b824 100644 --- a/hw/cxl/switch-mailbox-cci.c +++ b/hw/cxl/switch-mailbox-cci.c @@ -17,10 +17,12 @@ #include "hw/qdev-properties.h" 
#include "hw/cxl/cxl.h" +#define CXL_SWCCI_MSIX_MBOX 3 + static void cswmbcci_reset(DeviceState *dev) { CSWMBCCIDev *cswmb = CXL_SWITCH_MAILBOX_CCI(dev); - cxl_device_register_init_swcci(cswmb); + cxl_device_register_init_swcci(cswmb, CXL_SWCCI_MSIX_MBOX); } static void cswbcci_realize(PCIDevice *pci_dev, Error **errp) diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index e8e084c..5b21cf1 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1309,15 +1309,15 @@ static int amdvi_int_remap_msi(AMDVIState *iommu, ret = -AMDVI_IR_ERR; break; case AMDVI_IOAPIC_INT_TYPE_NMI: - pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK; + pass = dte[2] & AMDVI_DEV_NMI_PASS_MASK; trace_amdvi_ir_delivery_mode("nmi"); break; case AMDVI_IOAPIC_INT_TYPE_INIT: - pass = dte[3] & AMDVI_DEV_INT_PASS_MASK; + pass = dte[2] & AMDVI_DEV_INT_PASS_MASK; trace_amdvi_ir_delivery_mode("init"); break; case AMDVI_IOAPIC_INT_TYPE_EINT: - pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK; + pass = dte[2] & AMDVI_DEV_EINT_PASS_MASK; trace_amdvi_ir_delivery_mode("eint"); break; default: @@ -1593,9 +1593,9 @@ static void amdvi_pci_realize(PCIDevice *pdev, Error **errp) /* reset AMDVI specific capabilities, all r/o */ pci_set_long(pdev->config + s->capab_offset, AMDVI_CAPAB_FEATURES); pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_BAR_LOW, - AMDVI_BASE_ADDR & ~(0xffff0000)); + AMDVI_BASE_ADDR & MAKE_64BIT_MASK(14, 18)); pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH, - (AMDVI_BASE_ADDR & ~(0xffff)) >> 16); + AMDVI_BASE_ADDR >> 32); pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_RANGE, 0xff000000); pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_MISC, 0); diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index e0dac4d..2812513 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -187,7 +187,7 @@ AMDVI_CAPAB_FLAG_HTTUNNEL | AMDVI_CAPAB_EFR_SUP) /* AMDVI default address */ -#define AMDVI_BASE_ADDR 0xfed80000 +#define AMDVI_BASE_ADDR 0xfed80000ULL 
/* page management constants */ #define AMDVI_PAGE_SHIFT 12 diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 7fde060..dffd7ee 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -4697,10 +4697,11 @@ static void vtd_init(IntelIOMMUState *s) /* Should not reset address_spaces when reset because devices will still use * the address space they got at first (won't ask the bus again). */ -static void vtd_reset(DeviceState *dev) +static void vtd_reset_exit(Object *obj, ResetType type) { - IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); + IntelIOMMUState *s = INTEL_IOMMU_DEVICE(obj); + trace_vtd_reset_exit(); vtd_init(s); vtd_address_space_refresh_all(s); } @@ -4864,8 +4865,13 @@ static void vtd_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass); + ResettableClass *rc = RESETTABLE_CLASS(klass); - device_class_set_legacy_reset(dc, vtd_reset); + /* + * Use 'exit' reset phase to make sure all DMA requests + * have been quiesced during 'enter' or 'hold' phase + */ + rc->phases.exit = vtd_reset_exit; dc->vmsd = &vtd_vmstate; device_class_set_props(dc, vtd_properties); dc->hotpluggable = false; diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c index a8d354a..d0a236c 100644 --- a/hw/i386/microvm.c +++ b/hw/i386/microvm.c @@ -451,11 +451,44 @@ static HotplugHandler *microvm_get_hotplug_handler(MachineState *machine, return NULL; } +static void microvm_machine_done(Notifier *notifier, void *data) +{ + MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, + machine_done); + X86MachineState *x86ms = X86_MACHINE(mms); + + acpi_setup_microvm(mms); + dt_setup_microvm(mms); + fw_cfg_add_e820(x86ms->fw_cfg); +} + +static void microvm_powerdown_req(Notifier *notifier, void *data) +{ + MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, + powerdown_req); + X86MachineState *x86ms = X86_MACHINE(mms); + + if (x86ms->acpi_dev) { + Object *obj 
= OBJECT(x86ms->acpi_dev); + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj); + adevc->send_event(ACPI_DEVICE_IF(x86ms->acpi_dev), + ACPI_POWER_DOWN_STATUS); + } +} + static void microvm_machine_state_init(MachineState *machine) { MicrovmMachineState *mms = MICROVM_MACHINE(machine); X86MachineState *x86ms = X86_MACHINE(machine); + /* State */ + mms->kernel_cmdline_fixed = false; + + mms->machine_done.notify = microvm_machine_done; + qemu_add_machine_init_done_notifier(&mms->machine_done); + mms->powerdown_req.notify = microvm_powerdown_req; + qemu_register_powerdown_notifier(&mms->powerdown_req); + microvm_memory_init(mms); x86_cpus_init(x86ms, CPU_VERSION_LATEST); @@ -581,31 +614,6 @@ static void microvm_machine_set_auto_kernel_cmdline(Object *obj, bool value, mms->auto_kernel_cmdline = value; } -static void microvm_machine_done(Notifier *notifier, void *data) -{ - MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, - machine_done); - X86MachineState *x86ms = X86_MACHINE(mms); - - acpi_setup_microvm(mms); - dt_setup_microvm(mms); - fw_cfg_add_e820(x86ms->fw_cfg); -} - -static void microvm_powerdown_req(Notifier *notifier, void *data) -{ - MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, - powerdown_req); - X86MachineState *x86ms = X86_MACHINE(mms); - - if (x86ms->acpi_dev) { - Object *obj = OBJECT(x86ms->acpi_dev); - AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj); - adevc->send_event(ACPI_DEVICE_IF(x86ms->acpi_dev), - ACPI_POWER_DOWN_STATUS); - } -} - static void microvm_machine_initfn(Object *obj) { MicrovmMachineState *mms = MICROVM_MACHINE(obj); @@ -617,14 +625,6 @@ static void microvm_machine_initfn(Object *obj) mms->isa_serial = true; mms->option_roms = true; mms->auto_kernel_cmdline = true; - - /* State */ - mms->kernel_cmdline_fixed = false; - - mms->machine_done.notify = microvm_machine_done; - qemu_add_machine_init_done_notifier(&mms->machine_done); - mms->powerdown_req.notify = 
microvm_powerdown_req; - qemu_register_powerdown_notifier(&mms->powerdown_req); } GlobalProperty microvm_properties[] = { diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 22641e6..f199a8c 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1241,6 +1241,9 @@ void pc_basic_device_init(struct PCMachineState *pcms, /* Super I/O */ pc_superio_init(isa_bus, create_fdctrl, pcms->i8042_enabled, pcms->vmport != ON_OFF_AUTO_ON, &error_fatal); + + pcms->machine_done.notify = pc_machine_done; + qemu_add_machine_init_done_notifier(&pcms->machine_done); } void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus) @@ -1714,9 +1717,6 @@ static void pc_machine_initfn(Object *obj) if (pcmc->pci_enabled) { cxl_machine_init(obj, &pcms->cxl_devices_state); } - - pcms->machine_done.notify = pc_machine_done; - qemu_add_machine_init_done_notifier(&pcms->machine_done); } static void pc_machine_reset(MachineState *machine, ResetType type) diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 53c02d7..ac9e1a1 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -68,6 +68,7 @@ vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16 vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)" vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)" +vtd_reset_exit(void) "" # amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index 0ae1704..6fffa21 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -30,6 +30,14 @@ #include "hw/cxl/cxl.h" #include "hw/pci/msix.h" +/* type3 device private */ +enum CXL_T3_MSIX_VECTOR { + CXL_T3_MSIX_PCIE_DOE_TABLE_ACCESS = 0, + CXL_T3_MSIX_EVENT_START = 2, + CXL_T3_MSIX_MBOX = CXL_T3_MSIX_EVENT_START + 
CXL_EVENT_TYPE_MAX, + CXL_T3_MSIX_VECTOR_NR +}; + #define DWORD_BYTE 4 #define CXL_CAPACITY_MULTIPLIER (256 * MiB) @@ -843,7 +851,6 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) ComponentRegisters *regs = &cxl_cstate->crb; MemoryRegion *mr = &regs->component_registers; uint8_t *pci_conf = pci_dev->config; - unsigned short msix_num = 10; int i, rc; uint16_t count; @@ -884,31 +891,32 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) &ct3d->cxl_dstate.device_registers); /* MSI(-X) Initialization */ - rc = msix_init_exclusive_bar(pci_dev, msix_num, 4, NULL); + rc = msix_init_exclusive_bar(pci_dev, CXL_T3_MSIX_VECTOR_NR, 4, errp); if (rc) { - goto err_address_space_free; + goto err_free_special_ops; } - for (i = 0; i < msix_num; i++) { + for (i = 0; i < CXL_T3_MSIX_VECTOR_NR; i++) { msix_vector_use(pci_dev, i); } /* DOE Initialization */ - pcie_doe_init(pci_dev, &ct3d->doe_cdat, 0x190, doe_cdat_prot, true, 0); + pcie_doe_init(pci_dev, &ct3d->doe_cdat, 0x190, doe_cdat_prot, true, + CXL_T3_MSIX_PCIE_DOE_TABLE_ACCESS); cxl_cstate->cdat.build_cdat_table = ct3_build_cdat_table; cxl_cstate->cdat.free_cdat_table = ct3_free_cdat_table; cxl_cstate->cdat.private = ct3d; if (!cxl_doe_cdat_init(cxl_cstate, errp)) { - goto err_free_special_ops; + goto err_msix_uninit; } pcie_cap_deverr_init(pci_dev); /* Leave a bit of room for expansion */ - rc = pcie_aer_init(pci_dev, PCI_ERR_VER, 0x200, PCI_ERR_SIZEOF, NULL); + rc = pcie_aer_init(pci_dev, PCI_ERR_VER, 0x200, PCI_ERR_SIZEOF, errp); if (rc) { goto err_release_cdat; } - cxl_event_init(&ct3d->cxl_dstate, 2); + cxl_event_init(&ct3d->cxl_dstate, CXL_T3_MSIX_EVENT_START); /* Set default value for patrol scrub attributes */ ct3d->patrol_scrub_attrs.scrub_cycle_cap = @@ -935,9 +943,10 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) err_release_cdat: cxl_doe_cdat_release(cxl_cstate); +err_msix_uninit: + msix_uninit_exclusive_bar(pci_dev); err_free_special_ops: g_free(regs->special_ops); 
-err_address_space_free: if (ct3d->dc.host_dc) { cxl_destroy_dc_regions(ct3d); address_space_destroy(&ct3d->dc.host_dc_as); @@ -959,6 +968,7 @@ static void ct3_exit(PCIDevice *pci_dev) pcie_aer_exit(pci_dev); cxl_doe_cdat_release(cxl_cstate); + msix_uninit_exclusive_bar(pci_dev); g_free(regs->special_ops); if (ct3d->dc.host_dc) { cxl_destroy_dc_regions(ct3d); @@ -1090,10 +1100,17 @@ static bool cxl_type3_dpa(CXLType3Dev *ct3d, hwaddr host_addr, uint64_t *dpa) continue; } - *dpa = dpa_base + - ((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) | - ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & hpa_offset) - >> iw)); + if (iw < 8) { + *dpa = dpa_base + + ((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) | + ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & hpa_offset) + >> iw)); + } else { + *dpa = dpa_base + + ((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) | + ((((MAKE_64BIT_MASK(ig + iw, 64 - ig - iw) & hpa_offset) + >> (ig + iw)) / 3) << (ig + 8))); + } return true; } @@ -1202,7 +1219,7 @@ static void ct3d_reset(DeviceState *dev) pcie_cap_fill_link_ep_usp(PCI_DEVICE(dev), ct3d->width, ct3d->speed); cxl_component_register_init_common(reg_state, write_msk, CXL2_TYPE3_DEVICE); - cxl_device_register_init_t3(ct3d); + cxl_device_register_init_t3(ct3d, CXL_T3_MSIX_MBOX); /* * Bring up an endpoint to target with MCTP over VDM. 
diff --git a/hw/net/igb.c b/hw/net/igb.c index 4d93ce6..c965fc2 100644 --- a/hw/net/igb.c +++ b/hw/net/igb.c @@ -446,9 +446,13 @@ static void igb_pci_realize(PCIDevice *pci_dev, Error **errp) pcie_ari_init(pci_dev, 0x150); - pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, TYPE_IGBVF, - IGB_82576_VF_DEV_ID, IGB_MAX_VF_FUNCTIONS, IGB_MAX_VF_FUNCTIONS, - IGB_VF_OFFSET, IGB_VF_STRIDE); + if (!pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, TYPE_IGBVF, + IGB_82576_VF_DEV_ID, IGB_MAX_VF_FUNCTIONS, + IGB_MAX_VF_FUNCTIONS, IGB_VF_OFFSET, IGB_VF_STRIDE, + errp)) { + igb_cleanup_msix(s); + return; + } pcie_sriov_pf_init_vf_bar(pci_dev, IGBVF_MMIO_BAR_IDX, PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH, diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index d847429..de87cfa 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1352,18 +1352,25 @@ exit: static bool virtio_net_load_ebpf(VirtIONet *n, Error **errp) { - bool ret = false; + if (!virtio_net_attach_ebpf_to_backend(n->nic, -1)) { + return true; + } - if (virtio_net_attach_ebpf_to_backend(n->nic, -1)) { - trace_virtio_net_rss_load(n, n->nr_ebpf_rss_fds, n->ebpf_rss_fds); - if (n->ebpf_rss_fds) { - ret = virtio_net_load_ebpf_fds(n, errp); - } else { - ret = ebpf_rss_load(&n->ebpf_rss, errp); - } + trace_virtio_net_rss_load(n, n->nr_ebpf_rss_fds, n->ebpf_rss_fds); + + /* + * If user explicitly gave QEMU RSS FDs to use, then + * failing to use them must be considered a fatal + * error. If no RSS FDs were provided, QEMU is trying + * eBPF on a "best effort" basis only, so report a + * warning and allow fallback to software RSS. 
+ */ + if (n->ebpf_rss_fds) { + return virtio_net_load_ebpf_fds(n, errp); } - return ret; + ebpf_rss_load(&n->ebpf_rss, &error_warn); + return true; } static void virtio_net_unload_ebpf(VirtIONet *n) @@ -3913,23 +3920,7 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) net_rx_pkt_init(&n->rx_pkt); if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) { - Error *err = NULL; - if (!virtio_net_load_ebpf(n, &err)) { - /* - * If user explicitly gave QEMU RSS FDs to use, then - * failing to use them must be considered a fatal - * error. If no RSS FDs were provided, QEMU is trying - * eBPF on a "best effort" basis only, so report a - * warning and allow fallback to software RSS. - */ - if (n->ebpf_rss_fds) { - error_propagate(errp, err); - } else { - warn_report("unable to load eBPF RSS: %s", - error_get_pretty(err)); - error_free(err); - } - } + virtio_net_load_ebpf(n, errp); } } diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 68903d1..8175751 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8481,7 +8481,8 @@ out: return pow2ceil(bar_size); } -static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset) +static bool nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset, + Error **errp) { uint16_t vf_dev_id = n->params.use_intel_id ? 
PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME; @@ -8490,12 +8491,16 @@ static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset) le16_to_cpu(cap->vifrsm), NULL, NULL); - pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id, - n->params.sriov_max_vfs, n->params.sriov_max_vfs, - NVME_VF_OFFSET, NVME_VF_STRIDE); + if (!pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id, + n->params.sriov_max_vfs, n->params.sriov_max_vfs, + NVME_VF_OFFSET, NVME_VF_STRIDE, errp)) { + return false; + } pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size); + + return true; } static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset) @@ -8620,6 +8625,11 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) return false; } + if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs && + !nvme_init_sriov(n, pci_dev, 0x120, errp)) { + return false; + } + nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize); pcie_cap_deverr_init(pci_dev); @@ -8649,10 +8659,6 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) nvme_init_pmr(n, pci_dev); } - if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) { - nvme_init_sriov(n, pci_dev, 0x120); - } - return true; } diff --git a/hw/pci/msix.c b/hw/pci/msix.c index 57ec708..66f27b9 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -15,6 +15,7 @@ */ #include "qemu/osdep.h" +#include "qemu/log.h" #include "hw/pci/msi.h" #include "hw/pci/msix.h" #include "hw/pci/pci.h" @@ -260,6 +261,14 @@ static uint64_t msix_pba_mmio_read(void *opaque, hwaddr addr, static void msix_pba_mmio_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) { + PCIDevice *dev = opaque; + + qemu_log_mask(LOG_GUEST_ERROR, + "PCI [%s:%02x:%02x.%x] attempt to write to MSI-X " + "PBA at 0x%" FMT_PCIBUS ", ignoring.\n", + pci_root_bus_path(dev), pci_dev_bus_num(dev), + PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), + addr); } static const MemoryRegionOps 
msix_pba_mmio_ops = { diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 2afa423..1d42847 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -803,10 +803,17 @@ static bool migrate_is_not_pcie(void *opaque, int version_id) return !pci_is_express((PCIDevice *)opaque); } +static int pci_post_load(void *opaque, int version_id) +{ + pcie_sriov_pf_post_load(opaque); + return 0; +} + const VMStateDescription vmstate_pci_device = { .name = "PCIDevice", .version_id = 2, .minimum_version_id = 1, + .post_load = pci_post_load, .fields = (const VMStateField[]) { VMSTATE_INT32_POSITIVE_LE(version_id, PCIDevice), VMSTATE_BUFFER_UNSAFE_INFO_TEST(config, PCIDevice, @@ -1391,6 +1398,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, assert(hdr_type != PCI_HEADER_TYPE_BRIDGE || region_num < 2); r = &pci_dev->io_regions[region_num]; + assert(!r->size); r->addr = PCI_BAR_UNMAPPED; r->size = size; r->type = type; @@ -2963,7 +2971,17 @@ MSIMessage pci_get_msi_message(PCIDevice *dev, int vector) void pci_set_power(PCIDevice *d, bool state) { - pci_set_enabled(d, state); + /* + * Don't change the enabled state of VFs when powering on/off the device. + * + * When powering on, VFs must not be enabled immediately but they must + * wait until the guest configures SR-IOV. + * When powering off, their corresponding PFs will be reset and disable + * VFs. 
+ */ + if (!pci_is_vf(d)) { + pci_set_enabled(d, state); + } } void pci_set_enabled(PCIDevice *d, bool state) @@ -2977,7 +2995,7 @@ void pci_set_enabled(PCIDevice *d, bool state) memory_region_set_enabled(&d->bus_master_enable_region, (pci_get_word(d->config + PCI_COMMAND) & PCI_COMMAND_MASTER) && d->enabled); - if (!d->enabled) { + if (qdev_is_realized(&d->qdev)) { pci_device_reset(d); } } diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index e9b2322..1eb4358 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -20,23 +20,37 @@ #include "qapi/error.h" #include "trace.h" -static PCIDevice *register_vf(PCIDevice *pf, int devfn, - const char *name, uint16_t vf_num); -static void unregister_vfs(PCIDevice *dev); +static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs) +{ + for (uint16_t i = 0; i < total_vfs; i++) { + PCIDevice *vf = dev->exp.sriov_pf.vf[i]; + object_unparent(OBJECT(vf)); + object_unref(OBJECT(vf)); + } + g_free(dev->exp.sriov_pf.vf); + dev->exp.sriov_pf.vf = NULL; +} -void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, +bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, const char *vfname, uint16_t vf_dev_id, uint16_t init_vfs, uint16_t total_vfs, - uint16_t vf_offset, uint16_t vf_stride) + uint16_t vf_offset, uint16_t vf_stride, + Error **errp) { + BusState *bus = qdev_get_parent_bus(&dev->qdev); + int32_t devfn = dev->devfn + vf_offset; uint8_t *cfg = dev->config + offset; uint8_t *wmask; + if (total_vfs && + (uint32_t)devfn + (uint32_t)(total_vfs - 1) * vf_stride >= PCI_DEVFN_MAX) { + error_setg(errp, "VF addr overflows"); + return false; + } + pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1, offset, PCI_EXT_CAP_SRIOV_SIZEOF); dev->exp.sriov_cap = offset; - dev->exp.sriov_pf.num_vfs = 0; - dev->exp.sriov_pf.vfname = g_strdup(vfname); dev->exp.sriov_pf.vf = NULL; pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, vf_offset); @@ -69,13 +83,37 @@ void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, pci_set_word(wmask + 
PCI_SRIOV_SYS_PGSIZE, 0x553); qdev_prop_set_bit(&dev->qdev, "multifunction", true); + + dev->exp.sriov_pf.vf = g_new(PCIDevice *, total_vfs); + + for (uint16_t i = 0; i < total_vfs; i++) { + PCIDevice *vf = pci_new(devfn, vfname); + vf->exp.sriov_vf.pf = dev; + vf->exp.sriov_vf.vf_number = i; + + if (!qdev_realize(&vf->qdev, bus, errp)) { + object_unparent(OBJECT(vf)); + object_unref(vf); + unparent_vfs(dev, i); + return false; + } + + /* set vid/did according to sr/iov spec - they are not used */ + pci_config_set_vendor_id(vf->config, 0xffff); + pci_config_set_device_id(vf->config, 0xffff); + + dev->exp.sriov_pf.vf[i] = vf; + devfn += vf_stride; + } + + return true; } void pcie_sriov_pf_exit(PCIDevice *dev) { - unregister_vfs(dev); - g_free((char *)dev->exp.sriov_pf.vfname); - dev->exp.sriov_pf.vfname = NULL; + uint8_t *cfg = dev->config + dev->exp.sriov_cap; + + unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); } void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, @@ -141,80 +179,36 @@ void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, } } -static PCIDevice *register_vf(PCIDevice *pf, int devfn, const char *name, - uint16_t vf_num) -{ - PCIDevice *dev = pci_new(devfn, name); - dev->exp.sriov_vf.pf = pf; - dev->exp.sriov_vf.vf_number = vf_num; - PCIBus *bus = pci_get_bus(pf); - Error *local_err = NULL; - - qdev_realize(&dev->qdev, &bus->qbus, &local_err); - if (local_err) { - error_report_err(local_err); - return NULL; - } - - /* set vid/did according to sr/iov spec - they are not used */ - pci_config_set_vendor_id(dev->config, 0xffff); - pci_config_set_device_id(dev->config, 0xffff); - - return dev; -} - static void register_vfs(PCIDevice *dev) { uint16_t num_vfs; uint16_t i; uint16_t sriov_cap = dev->exp.sriov_cap; - uint16_t vf_offset = - pci_get_word(dev->config + sriov_cap + PCI_SRIOV_VF_OFFSET); - uint16_t vf_stride = - pci_get_word(dev->config + sriov_cap + PCI_SRIOV_VF_STRIDE); - int32_t devfn = dev->devfn + vf_offset; 
assert(sriov_cap > 0); num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); - if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { - return; - } - - dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs); trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), num_vfs); for (i = 0; i < num_vfs; i++) { - dev->exp.sriov_pf.vf[i] = register_vf(dev, devfn, - dev->exp.sriov_pf.vfname, i); - if (!dev->exp.sriov_pf.vf[i]) { - num_vfs = i; - break; - } - devfn += vf_stride; + pci_set_enabled(dev->exp.sriov_pf.vf[i], true); } - dev->exp.sriov_pf.num_vfs = num_vfs; + + pci_set_word(dev->wmask + sriov_cap + PCI_SRIOV_NUM_VF, 0); } static void unregister_vfs(PCIDevice *dev) { - uint16_t num_vfs = dev->exp.sriov_pf.num_vfs; + uint8_t *cfg = dev->config + dev->exp.sriov_cap; uint16_t i; trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), num_vfs); - for (i = 0; i < num_vfs; i++) { - Error *err = NULL; - PCIDevice *vf = dev->exp.sriov_pf.vf[i]; - if (!object_property_set_bool(OBJECT(vf), "realized", false, &err)) { - error_reportf_err(err, "Failed to unplug: "); - } - object_unparent(OBJECT(vf)); - object_unref(OBJECT(vf)); + PCI_FUNC(dev->devfn)); + for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) { + pci_set_enabled(dev->exp.sriov_pf.vf[i], false); } - g_free(dev->exp.sriov_pf.vf); - dev->exp.sriov_pf.vf = NULL; - dev->exp.sriov_pf.num_vfs = 0; + + pci_set_word(dev->wmask + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0xffff); } void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, @@ -235,15 +229,29 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, PCI_FUNC(dev->devfn), off, val, len); if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { - if (dev->exp.sriov_pf.num_vfs) { - if (!(val & PCI_SRIOV_CTRL_VFE)) { - unregister_vfs(dev); - } + if (val & PCI_SRIOV_CTRL_VFE) { + register_vfs(dev); } else { - if (val & PCI_SRIOV_CTRL_VFE) { - register_vfs(dev); - } + 
unregister_vfs(dev); } + } else if (range_covers_byte(off, len, PCI_SRIOV_NUM_VF)) { + uint8_t *cfg = dev->config + sriov_cap; + uint8_t *wmask = dev->wmask + sriov_cap; + uint16_t num_vfs = pci_get_word(cfg + PCI_SRIOV_NUM_VF); + uint16_t wmask_val = PCI_SRIOV_CTRL_MSE | PCI_SRIOV_CTRL_ARI; + + if (num_vfs <= pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)) { + wmask_val |= PCI_SRIOV_CTRL_VFE; + } + + pci_set_word(wmask + PCI_SRIOV_CTRL, wmask_val); + } +} + +void pcie_sriov_pf_post_load(PCIDevice *dev) +{ + if (dev->exp.sriov_cap) { + register_vfs(dev); } } @@ -260,6 +268,8 @@ void pcie_sriov_pf_reset(PCIDevice *dev) unregister_vfs(dev); pci_set_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF, 0); + pci_set_word(dev->wmask + sriov_cap + PCI_SRIOV_CTRL, + PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE | PCI_SRIOV_CTRL_ARI); /* * Default is to use 4K pages, software can modify it @@ -306,7 +316,7 @@ PCIDevice *pcie_sriov_get_pf(PCIDevice *dev) PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n) { assert(!pci_is_vf(dev)); - if (n < dev->exp.sriov_pf.num_vfs) { + if (n < pcie_sriov_num_vfs(dev)) { return dev->exp.sriov_pf.vf[n]; } return NULL; @@ -314,5 +324,10 @@ PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n) uint16_t pcie_sriov_num_vfs(PCIDevice *dev) { - return dev->exp.sriov_pf.num_vfs; + uint16_t sriov_cap = dev->exp.sriov_cap; + uint8_t *cfg = dev->config + sriov_cap; + + return sriov_cap && + (pci_get_word(cfg + PCI_SRIOV_CTRL) & PCI_SRIOV_CTRL_VFE) ? 
+ pci_get_word(cfg + PCI_SRIOV_NUM_VF) : 0; } diff --git a/hw/pci/trace-events b/hw/pci/trace-events index 19643aa..e98f575 100644 --- a/hw/pci/trace-events +++ b/hw/pci/trace-events @@ -14,7 +14,7 @@ msix_write_config(char *name, bool enabled, bool masked) "dev %s enabled %d mask # hw/pci/pcie_sriov.c sriov_register_vfs(const char *name, int slot, int function, int num_vfs) "%s %02x:%x: creating %d vf devs" -sriov_unregister_vfs(const char *name, int slot, int function, int num_vfs) "%s %02x:%x: Unregistering %d vf devs" +sriov_unregister_vfs(const char *name, int slot, int function) "%s %02x:%x: Unregistering vf devs" sriov_config_write(const char *name, int slot, int fun, uint32_t offset, uint32_t val, uint32_t len) "%s %02x:%x: sriov offset 0x%x val 0x%x len %d" # pcie.c diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 904227d..e0a9d50 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -1283,8 +1283,7 @@ static void spapr_dt_pci_device_cb(PCIBus *bus, PCIDevice *pdev, PciWalkFdt *p = opaque; int err; - if (p->err) { - /* Something's already broken, don't keep going */ + if (p->err || !pdev->enabled) { return; } @@ -1550,7 +1549,9 @@ static void spapr_pci_pre_plug(HotplugHandler *plug_handler, * hotplug, we do not allow functions to be hotplugged to a * slot that already has function 0 present */ - if (plugged_dev->hotplugged && bus->devices[PCI_DEVFN(slotnr, 0)] && + if (plugged_dev->hotplugged && + !pci_is_vf(pdev) && + bus->devices[PCI_DEVFN(slotnr, 0)] && PCI_FUNC(pdev->devfn) != 0) { error_setg(errp, "PCI: slot %d function 0 already occupied by %s," " additional functions can no longer be exposed to guest.", @@ -1572,6 +1573,14 @@ static void spapr_pci_plug(HotplugHandler *plug_handler, SpaprDrc *drc = drc_from_dev(phb, pdev); uint32_t slotnr = PCI_SLOT(pdev->devfn); + /* + * If DR or the PCI device is disabled we don't need to do anything + * in the case of hotplug or coldplug callbacks. 
+ */ + if (!pdev->enabled) { + return; + } + g_assert(drc); if (IS_PCI_BRIDGE(plugged_dev)) { @@ -1647,6 +1656,11 @@ static void spapr_pci_unplug_request(HotplugHandler *plug_handler, SpaprDrc *drc = drc_from_dev(phb, pdev); g_assert(drc); + + if (!drc->dev) { + return; + } + g_assert(drc->dev == plugged_dev); if (!spapr_drc_unplug_requested(drc)) { diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index eead269..913d72c 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -971,14 +971,7 @@ static void s390_pcihost_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, "this device"); } - if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { - PCIDevice *pdev = PCI_DEVICE(dev); - - if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { - error_setg(errp, "multifunction not supported in s390"); - return; - } - } else if (object_dynamic_cast(OBJECT(dev), TYPE_S390_PCI_DEVICE)) { + if (object_dynamic_cast(OBJECT(dev), TYPE_S390_PCI_DEVICE)) { S390PCIBusDevice *pbdev = S390_PCI_DEVICE(dev); if (!s390_pci_alloc_idx(s, pbdev)) { @@ -1069,6 +1062,18 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, } else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { pdev = PCI_DEVICE(dev); + /* + * Multifunction is not supported due to the lack of CLP. However, + * do not check for multifunction capability for SR-IOV devices because + * SR-IOV devices automatically add the multifunction capability whether + * the user intends to use the functions other than the PF. 
+ */ + if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION && + !pdev->exp.sriov_cap) { + error_setg(errp, "multifunction not supported in s390"); + return; + } + if (!dev->id) { /* In the case the PCI device does not define an id */ /* we generate one based on the PCI address */ @@ -1080,6 +1085,16 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, pbdev = s390_pci_find_dev_by_target(s, dev->id); if (!pbdev) { + /* + * VFs are automatically created by PF, and creating zpci for them + * will result in unexpected usage of fids. Currently QEMU does not + * support multifunction for s390x so we don't need zpci for VFs + * anyway. + */ + if (pci_is_vf(pdev)) { + return; + } + pbdev = s390_pci_device_new(s, dev->id, errp); if (!pbdev) { return; @@ -1167,7 +1182,10 @@ static void s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, int32_t devfn; pbdev = s390_pci_find_dev_by_pci(s, PCI_DEVICE(dev)); - g_assert(pbdev); + if (!pbdev) { + g_assert(pci_is_vf(pci_dev)); + return; + } s390_pci_generate_plug_event(HP_EVENT_STANDBY_TO_RESERVED, pbdev->fh, pbdev->fid); @@ -1206,7 +1224,11 @@ static void s390_pcihost_unplug_request(HotplugHandler *hotplug_dev, * we've checked the PCI device already (to prevent endless recursion). 
*/ pbdev = s390_pci_find_dev_by_pci(s, PCI_DEVICE(dev)); - g_assert(pbdev); + if (!pbdev) { + g_assert(pci_is_vf(PCI_DEVICE(dev))); + return; + } + pbdev->pci_unplug_request_processed = true; qdev_unplug(DEVICE(pbdev), errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_S390_PCI_DEVICE)) { diff --git a/hw/vfio/common.c b/hw/vfio/common.c index abbdc56..7a4010e 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1404,6 +1404,7 @@ void vfio_reset_handler(void *opaque) { VFIODevice *vbasedev; + trace_vfio_reset_handler(); QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { if (vbasedev->dev->realized) { vbasedev->ops->vfio_compute_needs_reset(vbasedev); diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index cab1cf1..c5385e1 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -120,6 +120,7 @@ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype vfio_legacy_dma_unmap_overflow_workaround(void) "" vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 +vfio_reset_handler(void) "" # platform.c vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s" diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 04e36ae..76f0d45 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -108,7 +108,7 @@ virtio_pci_notify_write(uint64_t addr, uint64_t val, unsigned int size) "0x%" PR virtio_pci_notify_write_pio(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)" # hw/virtio/virtio-iommu.c -virtio_iommu_device_reset(void) "reset!" +virtio_iommu_device_reset_exit(void) "reset!" virtio_iommu_system_reset(void) "system reset!" 
virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64 virtio_iommu_device_status(uint8_t status) "driver status = %d" diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c index 3d03395..fa4147b 100644 --- a/hw/virtio/vhost-iova-tree.c +++ b/hw/virtio/vhost-iova-tree.c @@ -28,12 +28,18 @@ struct VhostIOVATree { /* IOVA address to qemu memory maps. */ IOVATree *iova_taddr_map; + + /* Allocated IOVA addresses */ + IOVATree *iova_map; + + /* GPA->IOVA address memory maps */ + IOVATree *gpa_iova_map; }; /** - * Create a new IOVA tree + * Create a new VhostIOVATree * - * Returns the new IOVA tree + * Returns the new VhostIOVATree. */ VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last) { @@ -44,25 +50,29 @@ VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last) tree->iova_last = iova_last; tree->iova_taddr_map = iova_tree_new(); + tree->iova_map = iova_tree_new(); + tree->gpa_iova_map = gpa_tree_new(); return tree; } /** - * Delete an iova tree + * Delete a VhostIOVATree */ void vhost_iova_tree_delete(VhostIOVATree *iova_tree) { iova_tree_destroy(iova_tree->iova_taddr_map); + iova_tree_destroy(iova_tree->iova_map); + iova_tree_destroy(iova_tree->gpa_iova_map); g_free(iova_tree); } /** * Find the IOVA address stored from a memory address * - * @tree: The iova tree + * @tree: The VhostIOVATree * @map: The map with the memory address * - * Return the stored mapping, or NULL if not found. + * Returns the stored IOVA->HVA mapping, or NULL if not found. 
*/ const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree, const DMAMap *map) @@ -71,40 +81,111 @@ const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree, } /** - * Allocate a new mapping + * Allocate a new IOVA range and add the mapping to the IOVA->HVA tree * - * @tree: The iova tree - * @map: The iova map + * @tree: The VhostIOVATree + * @map: The IOVA mapping + * @taddr: The translated address (HVA) * * Returns: * - IOVA_OK if the map fits in the container * - IOVA_ERR_INVALID if the map does not make sense (like size overflow) * - IOVA_ERR_NOMEM if tree cannot allocate more space. * - * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK. + * It returns an assigned IOVA in map->iova if the return value is IOVA_OK. */ -int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map) +int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map, hwaddr taddr) { + int ret; + /* Some vhost devices do not like addr 0. Skip first page */ hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size(); - if (map->translated_addr + map->size < map->translated_addr || - map->perm == IOMMU_NONE) { + if (taddr + map->size < taddr || map->perm == IOMMU_NONE) { return IOVA_ERR_INVALID; } - /* Allocate a node in IOVA address */ - return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first, - tree->iova_last); + /* Allocate a node in the IOVA-only tree */ + ret = iova_tree_alloc_map(tree->iova_map, map, iova_first, tree->iova_last); + if (unlikely(ret != IOVA_OK)) { + return ret; + } + + /* Insert a node in the IOVA->HVA tree */ + map->translated_addr = taddr; + return iova_tree_insert(tree->iova_taddr_map, map); } /** - * Remove existing mappings from iova tree + * Remove existing mappings from the IOVA-only and IOVA->HVA trees * - * @iova_tree: The vhost iova tree + * @iova_tree: The VhostIOVATree * @map: The map to remove */ void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map) { 
iova_tree_remove(iova_tree->iova_taddr_map, map); + iova_tree_remove(iova_tree->iova_map, map); +} + +/** + * Find the IOVA address stored from a guest memory address (GPA) + * + * @tree: The VhostIOVATree + * @map: The map with the guest memory address + * + * Returns the stored GPA->IOVA mapping, or NULL if not found. + */ +const DMAMap *vhost_iova_tree_find_gpa(const VhostIOVATree *tree, + const DMAMap *map) +{ + return iova_tree_find_iova(tree->gpa_iova_map, map); +} + +/** + * Allocate a new IOVA range and add the mapping to the GPA->IOVA tree + * + * @tree: The VhostIOVATree + * @map: The IOVA mapping + * @taddr: The translated address (GPA) + * + * Returns: + * - IOVA_OK if the map fits both containers + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow) + * - IOVA_ERR_NOMEM if the IOVA-only tree cannot allocate more space + * + * It returns an assigned IOVA in map->iova if the return value is IOVA_OK. + */ +int vhost_iova_tree_map_alloc_gpa(VhostIOVATree *tree, DMAMap *map, hwaddr taddr) +{ + int ret; + + /* Some vhost devices don't like addr 0. 
Skip first page */ + hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size(); + + if (taddr + map->size < taddr || map->perm == IOMMU_NONE) { + return IOVA_ERR_INVALID; + } + + /* Allocate a node in the IOVA-only tree */ + ret = iova_tree_alloc_map(tree->iova_map, map, iova_first, tree->iova_last); + if (unlikely(ret != IOVA_OK)) { + return ret; + } + + /* Insert a node in the GPA->IOVA tree */ + map->translated_addr = taddr; + return gpa_tree_insert(tree->gpa_iova_map, map); +} + +/** + * Remove existing mappings from the IOVA-only and GPA->IOVA trees + * + * @tree: The VhostIOVATree + * @map: The map to remove + */ +void vhost_iova_tree_remove_gpa(VhostIOVATree *iova_tree, DMAMap map) +{ + iova_tree_remove(iova_tree->gpa_iova_map, map); + iova_tree_remove(iova_tree->iova_map, map); } diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h index 4adfd79..0c4ba5a 100644 --- a/hw/virtio/vhost-iova-tree.h +++ b/hw/virtio/vhost-iova-tree.h @@ -21,7 +21,13 @@ G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete); const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree, const DMAMap *map); -int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map); +int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map, + hwaddr taddr); void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map); +const DMAMap *vhost_iova_tree_find_gpa(const VhostIOVATree *iova_tree, + const DMAMap *map); +int vhost_iova_tree_map_alloc_gpa(VhostIOVATree *iova_tree, DMAMap *map, + hwaddr taddr); +void vhost_iova_tree_remove_gpa(VhostIOVATree *iova_tree, DMAMap map); #endif diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 37aca8b..2481d49 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -78,24 +78,39 @@ uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) * @vaddr: Translated IOVA addresses * @iovec: Source qemu's VA 
addresses * @num: Length of iovec and minimum length of vaddr + * @gpas: Descriptors' GPAs, if backed by guest memory */ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, hwaddr *addrs, const struct iovec *iovec, - size_t num) + size_t num, const hwaddr *gpas) { if (num == 0) { return true; } for (size_t i = 0; i < num; ++i) { - DMAMap needle = { - .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base, - .size = iovec[i].iov_len, - }; Int128 needle_last, map_last; size_t off; + const DMAMap *map; + DMAMap needle; + + /* Check if the descriptor is backed by guest memory */ + if (gpas) { + /* Search the GPA->IOVA tree */ + needle = (DMAMap) { + .translated_addr = gpas[i], + .size = iovec[i].iov_len, + }; + map = vhost_iova_tree_find_gpa(svq->iova_tree, &needle); + } else { + /* Search the IOVA->HVA tree */ + needle = (DMAMap) { + .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base, + .size = iovec[i].iov_len, + }; + map = vhost_iova_tree_find_iova(svq->iova_tree, &needle); + } - const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle); /* * Map cannot be NULL since iova map contains all guest space and * qemu already has a physical address mapped @@ -130,6 +145,7 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, * @sg: Cache for hwaddr * @iovec: The iovec from the guest * @num: iovec length + * @addr: Descriptors' GPAs, if backed by guest memory * @more_descs: True if more descriptors come in the chain * @write: True if they are writeable descriptors * @@ -137,7 +153,8 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, */ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, const struct iovec *iovec, size_t num, - bool more_descs, bool write) + const hwaddr *addr, bool more_descs, + bool write) { uint16_t i = svq->free_head, last = svq->free_head; unsigned n; @@ -149,7 +166,7 @@ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, 
return true; } - ok = vhost_svq_translate_addr(svq, sg, iovec, num); + ok = vhost_svq_translate_addr(svq, sg, iovec, num, addr); if (unlikely(!ok)) { return false; } @@ -165,17 +182,18 @@ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, descs[i].len = cpu_to_le32(iovec[n].iov_len); last = i; - i = cpu_to_le16(svq->desc_next[i]); + i = svq->desc_next[i]; } - svq->free_head = le16_to_cpu(svq->desc_next[last]); + svq->free_head = svq->desc_next[last]; return true; } static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, const struct iovec *out_sg, size_t out_num, + const hwaddr *out_addr, const struct iovec *in_sg, size_t in_num, - unsigned *head) + const hwaddr *in_addr, unsigned *head) { unsigned avail_idx; vring_avail_t *avail = svq->vring.avail; @@ -191,13 +209,14 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, return false; } - ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0, - false); + ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr, + in_num > 0, false); if (unlikely(!ok)) { return false; } - ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true); + ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr, false, + true); if (unlikely(!ok)) { return false; } @@ -228,10 +247,12 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq) smp_mb(); if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) { - uint16_t avail_event = *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]); + uint16_t avail_event = le16_to_cpu( + *(uint16_t *)(&svq->vring.used->ring[svq->vring.num])); needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx, svq->shadow_avail_idx - 1); } else { - needs_kick = !(svq->vring.used->flags & VRING_USED_F_NO_NOTIFY); + needs_kick = + !(svq->vring.used->flags & cpu_to_le16(VRING_USED_F_NO_NOTIFY)); } if (!needs_kick) { @@ -247,8 +268,9 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq) * Return -EINVAL if 
element is invalid, -ENOSPC if dev queue is full */ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, - size_t out_num, const struct iovec *in_sg, size_t in_num, - VirtQueueElement *elem) + size_t out_num, const hwaddr *out_addr, + const struct iovec *in_sg, size_t in_num, + const hwaddr *in_addr, VirtQueueElement *elem) { unsigned qemu_head; unsigned ndescs = in_num + out_num; @@ -258,7 +280,8 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, return -ENOSPC; } - ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head); + ok = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num, + in_addr, &qemu_head); if (unlikely(!ok)) { return -EINVAL; } @@ -274,8 +297,8 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, static int vhost_svq_add_element(VhostShadowVirtqueue *svq, VirtQueueElement *elem) { - return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg, - elem->in_num, elem); + return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->out_addr, + elem->in_sg, elem->in_num, elem->in_addr, elem); } /** @@ -365,7 +388,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) return true; } - svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx); + svq->shadow_used_idx = le16_to_cpu(*(volatile uint16_t *)used_idx); return svq->last_used_idx != svq->shadow_used_idx; } @@ -383,7 +406,7 @@ static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq) { if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) { uint16_t *used_event = (uint16_t *)&svq->vring.avail->ring[svq->vring.num]; - *used_event = svq->shadow_used_idx; + *used_event = cpu_to_le16(svq->shadow_used_idx); } else { svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); } @@ -408,7 +431,7 @@ static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq, uint16_t num, uint16_t i) { for (uint16_t j = 0; j < (num - 1); ++j) { - i = 
le16_to_cpu(svq->desc_next[i]); + i = svq->desc_next[i]; } return i; @@ -683,7 +706,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, svq->desc_state = g_new0(SVQDescState, svq->vring.num); svq->desc_next = g_new0(uint16_t, svq->vring.num); for (unsigned i = 0; i < svq->vring.num - 1; i++) { - svq->desc_next[i] = cpu_to_le16(i + 1); + svq->desc_next[i] = i + 1; } } diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h index 19c842a..9c27373 100644 --- a/hw/virtio/vhost-shadow-virtqueue.h +++ b/hw/virtio/vhost-shadow-virtqueue.h @@ -118,8 +118,9 @@ uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq); void vhost_svq_push_elem(VhostShadowVirtqueue *svq, const VirtQueueElement *elem, uint32_t len); int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, - size_t out_num, const struct iovec *in_sg, size_t in_num, - VirtQueueElement *elem); + size_t out_num, const hwaddr *out_addr, + const struct iovec *in_sg, size_t in_num, + const hwaddr *in_addr, VirtQueueElement *elem); size_t vhost_svq_poll(VhostShadowVirtqueue *svq, size_t num); void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); diff --git a/hw/virtio/vhost-user-snd.c b/hw/virtio/vhost-user-snd.c index 8610370..b414c75 100644 --- a/hw/virtio/vhost-user-snd.c +++ b/hw/virtio/vhost-user-snd.c @@ -16,6 +16,18 @@ #include "standard-headers/linux/virtio_ids.h" #include "standard-headers/linux/virtio_snd.h" +static const VirtIOFeature feature_sizes[] = { + {.flags = 1ULL << VIRTIO_SND_F_CTLS, + .end = endof(struct virtio_snd_config, controls)}, + {} +}; + +static const VirtIOConfigSizeParams cfg_size_params = { + .min_size = endof(struct virtio_snd_config, chmaps), + .max_size = sizeof(struct virtio_snd_config), + .feature_sizes = feature_sizes +}; + static const VMStateDescription vu_snd_vmstate = { .name = "vhost-user-snd", .unmigratable = 1, @@ -23,16 +35,20 @@ static const VMStateDescription vu_snd_vmstate 
= { static const Property vsnd_properties[] = { DEFINE_PROP_CHR("chardev", VHostUserBase, chardev), + DEFINE_PROP_BIT64("controls", VHostUserBase, + parent_obj.host_features, VIRTIO_SND_F_CTLS, false), }; static void vu_snd_base_realize(DeviceState *dev, Error **errp) { VHostUserBase *vub = VHOST_USER_BASE(dev); VHostUserBaseClass *vubs = VHOST_USER_BASE_GET_CLASS(dev); + VirtIODevice *vdev = &vub->parent_obj; vub->virtio_id = VIRTIO_ID_SOUND; vub->num_vqs = 4; - vub->config_size = sizeof(struct virtio_snd_config); + vub->config_size = virtio_get_config_size(&cfg_size_params, + vdev->host_features); vub->vq_size = 64; vubs->parent_realize(dev, errp); diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 3cdaa12..7efbde3 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -360,14 +360,20 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); if (s->shadow_data) { int r; + hwaddr gpa = section->offset_within_address_space; - mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr, mem_region.size = int128_get64(llsize) - 1, mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly), - r = vhost_iova_tree_map_alloc(s->iova_tree, &mem_region); + r = vhost_iova_tree_map_alloc_gpa(s->iova_tree, &mem_region, gpa); if (unlikely(r != IOVA_OK)) { error_report("Can't allocate a mapping (%d)", r); + + if (mem_region.translated_addr == gpa) { + error_report("Insertion to GPA->IOVA tree failed"); + /* Remove the mapping from the IOVA-only tree */ + goto fail_map; + } goto fail; } @@ -386,7 +392,7 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, fail_map: if (s->shadow_data) { - vhost_iova_tree_remove(s->iova_tree, mem_region); + vhost_iova_tree_remove_gpa(s->iova_tree, mem_region); } fail: @@ -440,21 +446,18 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, if (s->shadow_data) { const DMAMap *result; - const void *vaddr = 
memory_region_get_ram_ptr(section->mr) + - section->offset_within_region + - (iova - section->offset_within_address_space); DMAMap mem_region = { - .translated_addr = (hwaddr)(uintptr_t)vaddr, + .translated_addr = section->offset_within_address_space, .size = int128_get64(llsize) - 1, }; - result = vhost_iova_tree_find_iova(s->iova_tree, &mem_region); + result = vhost_iova_tree_find_gpa(s->iova_tree, &mem_region); if (!result) { /* The memory listener map wasn't mapped */ return; } iova = result->iova; - vhost_iova_tree_remove(s->iova_tree, *result); + vhost_iova_tree_remove_gpa(s->iova_tree, *result); } vhost_vdpa_iotlb_batch_begin_once(s); /* @@ -1142,16 +1145,23 @@ static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, * * @v: Vhost-vdpa device * @needle: The area to search iova + * @taddr: The translated address (HVA) * @errorp: Error pointer */ static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, - Error **errp) + hwaddr taddr, Error **errp) { int r; - r = vhost_iova_tree_map_alloc(v->shared->iova_tree, needle); + r = vhost_iova_tree_map_alloc(v->shared->iova_tree, needle, taddr); if (unlikely(r != IOVA_OK)) { error_setg(errp, "Cannot allocate iova (%d)", r); + + if (needle->translated_addr == taddr) { + error_append_hint(errp, "Insertion to IOVA->HVA tree failed"); + /* Remove the mapping from the IOVA-only tree */ + vhost_iova_tree_remove(v->shared->iova_tree, *needle); + } return false; } @@ -1192,11 +1202,11 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, vhost_svq_get_vring_addr(svq, &svq_addr); driver_region = (DMAMap) { - .translated_addr = svq_addr.desc_user_addr, .size = driver_size - 1, .perm = IOMMU_RO, }; - ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); + ok = vhost_vdpa_svq_map_ring(v, &driver_region, svq_addr.desc_user_addr, + errp); if (unlikely(!ok)) { error_prepend(errp, "Cannot create vq driver region: "); return false; @@ -1206,11 +1216,11 @@ static bool vhost_vdpa_svq_map_rings(struct 
vhost_dev *dev, addr->avail_user_addr = driver_region.iova + avail_offset; device_region = (DMAMap) { - .translated_addr = svq_addr.used_user_addr, .size = device_size - 1, .perm = IOMMU_RW, }; - ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); + ok = vhost_vdpa_svq_map_ring(v, &device_region, svq_addr.used_user_addr, + errp); if (unlikely(!ok)) { error_prepend(errp, "Cannot create vq device region: "); vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr); diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index ad05768..2eb5a14 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -31,7 +31,7 @@ #include "trace.h" #include "qemu/error-report.h" #include "migration/misc.h" - +#include "system/reset.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" @@ -910,6 +910,8 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) } reset_stats(s); + s->stats_last_update = 0; + qemu_register_resettable(OBJECT(dev)); } static void virtio_balloon_device_unrealize(DeviceState *dev) @@ -917,6 +919,7 @@ static void virtio_balloon_device_unrealize(DeviceState *dev) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOBalloon *s = VIRTIO_BALLOON(dev); + qemu_unregister_resettable(OBJECT(dev)); if (s->free_page_bh) { qemu_bh_delete(s->free_page_bh); object_unref(OBJECT(s->iothread)); @@ -987,6 +990,27 @@ static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status) } } +static ResettableState *virtio_balloon_get_reset_state(Object *obj) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(obj); + return &s->reset_state; +} + +static void virtio_balloon_reset_enter(Object *obj, ResetType type) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(obj); + + /* + * When waking up from standby/suspend-to-ram, do not reset stats. 
+ */ + if (type == RESET_TYPE_WAKEUP) { + return; + } + + reset_stats(s); + s->stats_last_update = 0; +} + static void virtio_balloon_instance_init(Object *obj) { VirtIOBalloon *s = VIRTIO_BALLOON(obj); @@ -1038,6 +1062,7 @@ static void virtio_balloon_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + ResettableClass *rc = RESETTABLE_CLASS(klass); device_class_set_props(dc, virtio_balloon_properties); dc->vmsd = &vmstate_virtio_balloon; @@ -1050,6 +1075,9 @@ static void virtio_balloon_class_init(ObjectClass *klass, void *data) vdc->get_features = virtio_balloon_get_features; vdc->set_status = virtio_balloon_set_status; vdc->vmsd = &vmstate_virtio_balloon_device; + + rc->get_state = virtio_balloon_get_reset_state; + rc->phases.enter = virtio_balloon_reset_enter; } static const TypeInfo virtio_balloon_info = { diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index f41104a..b6e7e01 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -1504,11 +1504,11 @@ static void virtio_iommu_device_unrealize(DeviceState *dev) virtio_cleanup(vdev); } -static void virtio_iommu_device_reset(VirtIODevice *vdev) +static void virtio_iommu_device_reset_exit(Object *obj, ResetType type) { - VirtIOIOMMU *s = VIRTIO_IOMMU(vdev); + VirtIOIOMMU *s = VIRTIO_IOMMU(obj); - trace_virtio_iommu_device_reset(); + trace_virtio_iommu_device_reset_exit(); if (s->domains) { g_tree_destroy(s->domains); @@ -1668,6 +1668,7 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + ResettableClass *rc = RESETTABLE_CLASS(klass); device_class_set_props(dc, virtio_iommu_properties); dc->vmsd = &vmstate_virtio_iommu; @@ -1675,7 +1676,12 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data) set_bit(DEVICE_CATEGORY_MISC, dc->categories); vdc->realize = 
virtio_iommu_device_realize; vdc->unrealize = virtio_iommu_device_unrealize; - vdc->reset = virtio_iommu_device_reset; + + /* + * Use 'exit' reset phase to make sure all DMA requests + * have been quiesced during 'enter' or 'hold' phase + */ + rc->phases.exit = virtio_iommu_device_reset_exit; vdc->get_config = virtio_iommu_get_config; vdc->set_config = virtio_iommu_set_config; vdc->get_features = virtio_iommu_get_features; diff --git a/hw/virtio/virtio-nsm.c b/hw/virtio/virtio-nsm.c index 098e1ae..b22aa74 100644 --- a/hw/virtio/virtio-nsm.c +++ b/hw/virtio/virtio-nsm.c @@ -1596,7 +1596,7 @@ static void handle_input(VirtIODevice *vdev, VirtQueue *vq) g_free(req.iov_base); g_free(res.iov_base); virtqueue_push(vq, out_elem, 0); - virtqueue_push(vq, in_elem, in_elem->in_sg->iov_len); + virtqueue_push(vq, in_elem, sz); virtio_notify(vdev, vq); return; |