diff options
79 files changed, 1502 insertions, 665 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 16af379..aa67630 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3032,6 +3032,16 @@ F: include/qemu/co-shared-resource.h T: git https://gitlab.com/jsnow/qemu.git jobs T: git https://gitlab.com/vsementsov/qemu.git block +CheckPoint and Restart (CPR) +R: Steve Sistare <steven.sistare@oracle.com> +S: Supported +F: hw/vfio/cpr* +F: include/hw/vfio/vfio-cpr.h +F: include/migration/cpr.h +F: migration/cpr* +F: tests/qtest/migration/cpr* +F: docs/devel/migration/CPR.rst + Compute Express Link M: Jonathan Cameron <jonathan.cameron@huawei.com> R: Fan Ni <fan.ni@samsung.com> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 51526d3..a317783 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -99,6 +99,7 @@ bool kvm_allowed; bool kvm_readonly_mem_allowed; bool kvm_vm_attributes_allowed; bool kvm_msi_use_devid; +bool kvm_pre_fault_memory_supported; static bool kvm_has_guest_debug; static int kvm_sstep_flags; static bool kvm_immediate_exit; @@ -2745,6 +2746,7 @@ static int kvm_init(MachineState *ms) kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) && kvm_check_extension(s, KVM_CAP_USER_MEMORY2) && (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE); + kvm_pre_fault_memory_supported = kvm_vm_check_extension(s, KVM_CAP_PRE_FAULT_MEMORY); if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) { s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; diff --git a/backends/iommufd.c b/backends/iommufd.c index b73f75c..c2c47ab 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -311,6 +311,62 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, return true; } +bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data, + Error **errp) +{ + int ret, fd = be->fd; + uint32_t total_entries = *entry_num; + struct iommu_hwpt_invalidate cache = { + .size = sizeof(cache), + .hwpt_id = id, + .data_type = data_type, + .entry_len = entry_len, + .entry_num = total_entries, + .data_uptr = (uintptr_t)data, + }; + + ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); + trace_iommufd_backend_invalidate_cache(fd, id, data_type, entry_len, + total_entries, cache.entry_num, + (uintptr_t)data, ret ? errno : 0); + *entry_num = cache.entry_num; + + if (ret) { + error_setg_errno(errp, errno, "IOMMU_HWPT_INVALIDATE failed:" + " total %d entries, processed %d entries", + total_entries, cache.entry_num); + } else if (total_entries != cache.entry_num) { + error_setg(errp, "IOMMU_HWPT_INVALIDATE succeed but with unprocessed" + " entries: total %d entries, processed %d entries." + " Kernel BUG?!", total_entries, cache.entry_num); + return false; + } + + return !ret; +} + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->attach_hwpt); + return idevc->attach_hwpt(idev, hwpt_id, errp); +} + +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->detach_hwpt); + return idevc->detach_hwpt(idev, errp); +} + static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) { HostIOMMUDeviceCaps *caps = &hiod->caps; @@ -349,6 +405,8 @@ static const TypeInfo types[] = { }, { .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, .parent = TYPE_HOST_IOMMU_DEVICE, + .instance_size = sizeof(HostIOMMUDeviceIOMMUFD), + .class_size = sizeof(HostIOMMUDeviceIOMMUFDClass), .class_init = hiod_iommufd_class_init, .abstract = true, } diff --git a/backends/trace-events b/backends/trace-events index 40811a3..7278214 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -18,3 +18,4 @@ iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_ iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" +iommufd_backend_invalidate_cache(int iommufd, uint32_t id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" diff --git a/docs/devel/rust.rst b/docs/devel/rust.rst index 34d9c79..47e9677 100644 --- a/docs/devel/rust.rst +++ b/docs/devel/rust.rst @@ -96,6 +96,11 @@ are missing: architecture (VMState). Right now, VMState lacks type safety because it is hard to place the ``VMStateField`` definitions in traits. +* NUL-terminated file names with ``#[track_caller]`` are scheduled for + inclusion as ``#![feature(location_file_nul)]``, but it will be a while + before QEMU can use them. For now, there is special code in + ``util/error.c`` to support non-NUL-terminated file names. + * associated const equality would be nice to have for some users of ``callbacks::FnCall``, but is still experimental. ``ASSERT_IS_SOME`` replaces it. @@ -155,10 +160,10 @@ module status ``callbacks`` complete ``cell`` stable ``errno`` complete +``error`` stable ``irq`` complete ``memory`` stable ``module`` complete -``offset_of`` stable ``qdev`` stable ``qom`` stable ``sysbus`` stable diff --git a/docs/sphinx/qapi_domain.py b/docs/sphinx/qapi_domain.py index c94af57..ebc46a7 100644 --- a/docs/sphinx/qapi_domain.py +++ b/docs/sphinx/qapi_domain.py @@ -20,16 +20,6 @@ from typing import ( from docutils import nodes from docutils.parsers.rst import directives - -from compat import ( - CompatField, - CompatGroupedField, - CompatTypedField, - KeywordNode, - ParserFix, - Signature, - SpaceNode, -) from sphinx import addnodes from sphinx.directives import ObjectDescription from sphinx.domains import ( @@ -44,6 +34,16 @@ from sphinx.util import logging from sphinx.util.docutils import SphinxDirective from sphinx.util.nodes import make_id, make_refnode +from compat import ( + CompatField, + CompatGroupedField, + CompatTypedField, + KeywordNode, + ParserFix, + Signature, + SpaceNode, +) + if TYPE_CHECKING: from typing import ( @@ -56,7 +56,6 @@ if TYPE_CHECKING: ) from docutils.nodes import Element, Node - from sphinx.addnodes import desc_signature, pending_xref from sphinx.application import Sphinx from sphinx.builders import Builder @@ -168,6 +167,8 @@ class QAPIDescription(ParserFix): """ def handle_signature(self, sig: str, signode: desc_signature) -> Signature: + # pylint: disable=unused-argument + # Do nothing. The return value here is the "name" of the entity # being documented; for QAPI, this is the same as the # "signature", which is just a name. @@ -210,6 +211,8 @@ class QAPIDescription(ParserFix): def add_target_and_index( self, name: Signature, sig: str, signode: desc_signature ) -> None: + # pylint: disable=unused-argument + # name is the return value of handle_signature. # sig is the original, raw text argument to handle_signature. # For QAPI, these are identical, currently. diff --git a/docs/sphinx/qapidoc.py b/docs/sphinx/qapidoc.py index 661b2c4..8011ac9 100644 --- a/docs/sphinx/qapidoc.py +++ b/docs/sphinx/qapidoc.py @@ -27,6 +27,7 @@ https://www.sphinx-doc.org/en/master/development/index.html from __future__ import annotations + __version__ = "2.0" from contextlib import contextmanager @@ -56,8 +57,6 @@ from qapi.schema import ( QAPISchemaVisitor, ) from qapi.source import QAPISourceInfo - -from qapidoc_legacy import QAPISchemaGenRSTVisitor # type: ignore from sphinx import addnodes from sphinx.directives.code import CodeBlock from sphinx.errors import ExtensionError @@ -65,6 +64,8 @@ from sphinx.util import logging from sphinx.util.docutils import SphinxDirective, switch_source_input from sphinx.util.nodes import nested_parse_with_titles +from qapidoc_legacy import QAPISchemaGenRSTVisitor # type: ignore + if TYPE_CHECKING: from typing import ( diff --git a/hw/display/apple-gfx.m b/hw/display/apple-gfx.m index 8dde1f1..174d56a 100644 --- a/hw/display/apple-gfx.m +++ b/hw/display/apple-gfx.m @@ -454,7 +454,7 @@ static void set_cursor_glyph(void *opaque) /* ------ DMA (device reading system memory) ------ */ typedef struct AppleGFXReadMemoryJob { - QemuSemaphore sem; + QemuEvent event; hwaddr physical_address; uint64_t length; void *dst; @@ -470,7 +470,7 @@ static void apple_gfx_do_read_memory(void *opaque) job->dst, job->length, MEMTXATTRS_UNSPECIFIED); job->success = (r == MEMTX_OK); - qemu_sem_post(&job->sem); + qemu_event_set(&job->event); } static bool apple_gfx_read_memory(AppleGFXState *s, hwaddr physical_address, @@ -483,11 +483,11 @@ static bool apple_gfx_read_memory(AppleGFXState *s, hwaddr physical_address, trace_apple_gfx_read_memory(physical_address, length, dst); /* Performing DMA requires BQL, so do it in a BH. */ - qemu_sem_init(&job.sem, 0); + qemu_event_init(&job.event, 0); aio_bh_schedule_oneshot(qemu_get_aio_context(), apple_gfx_do_read_memory, &job); - qemu_sem_wait(&job.sem); - qemu_sem_destroy(&job.sem); + qemu_event_wait(&job.event); + qemu_event_destroy(&job.event); return job.success; } diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c index 0fd1337..cb48cc1 100644 --- a/hw/timer/hpet.c +++ b/hw/timer/hpet.c @@ -328,16 +328,16 @@ static const VMStateDescription vmstate_hpet_timer = { static const VMStateDescription vmstate_hpet = { .name = "hpet", .version_id = 2, - .minimum_version_id = 1, + .minimum_version_id = 2, .pre_save = hpet_pre_save, .post_load = hpet_post_load, .fields = (const VMStateField[]) { VMSTATE_UINT64(config, HPETState), VMSTATE_UINT64(isr, HPETState), VMSTATE_UINT64(hpet_counter, HPETState), - VMSTATE_UINT8_V(num_timers_save, HPETState, 2), + VMSTATE_UINT8(num_timers_save, HPETState), VMSTATE_VALIDATE("num_timers must match", hpet_validate_num_timers), - VMSTATE_STRUCT_VARRAY_UINT8(timer, HPETState, num_timers, 0, + VMSTATE_STRUCT_VARRAY_UINT8(timer, HPETState, num_timers_save, 0, vmstate_hpet_timer, HPETTimer), VMSTATE_END_OF_LIST() }, @@ -691,8 +691,14 @@ static void hpet_realize(DeviceState *dev, Error **errp) int i; HPETTimer *timer; + if (s->num_timers < HPET_MIN_TIMERS || s->num_timers > HPET_MAX_TIMERS) { + error_setg(errp, "hpet.num_timers must be between %d and %d", + HPET_MIN_TIMERS, HPET_MAX_TIMERS); + return; + } if (!s->intcap) { - warn_report("Hpet's intcap not initialized"); + error_setg(errp, "hpet.hpet-intcap not initialized"); + return; } if (hpet_fw_cfg.count == UINT8_MAX) { /* first instance */ @@ -700,7 +706,7 @@ static void hpet_realize(DeviceState *dev, Error **errp) } if (hpet_fw_cfg.count == 8) { - error_setg(errp, "Only 8 instances of HPET is allowed"); + error_setg(errp, "Only 8 instances of HPET are allowed"); return; } @@ -710,11 +716,6 @@ static void hpet_realize(DeviceState *dev, Error **errp) sysbus_init_irq(sbd, &s->irqs[i]); } - if (s->num_timers < HPET_MIN_TIMERS) { - s->num_timers = HPET_MIN_TIMERS; - } else if (s->num_timers > HPET_MAX_TIMERS) { - s->num_timers = HPET_MAX_TIMERS; - } for (i = 0; i < HPET_MAX_TIMERS; i++) { timer = &s->timer[i]; timer->qemu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, hpet_timer, timer); diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 1c6ca94..d834bd4 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -75,12 +75,12 @@ void vfio_address_space_insert(VFIOAddressSpace *space, int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly) + void *vaddr, bool readonly, MemoryRegion *mr) { VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); g_assert(vioc->dma_map); - return vioc->dma_map(bcontainer, iova, size, vaddr, readonly); + return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr); } int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, diff --git a/hw/vfio/container.c b/hw/vfio/container.c index a9f0dba..0f948d0 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -33,8 +33,8 @@ #include "qapi/error.h" #include "pci.h" #include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-cpr.h" #include "vfio-helpers.h" -#include "vfio-cpr.h" #include "vfio-listener.h" #define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" @@ -207,7 +207,8 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, } static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { const VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index 3214184..0210e76 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -8,9 +8,9 @@ #include "qemu/osdep.h" #include "hw/vfio/vfio-device.h" #include "migration/misc.h" +#include "hw/vfio/vfio-cpr.h" #include "qapi/error.h" #include "system/runstate.h" -#include "vfio-cpr.h" static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, MigrationEvent *e, Error **errp) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index e7952d1..e7a9d1f 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -187,23 +187,21 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, } static bool vfio_pci_igd_opregion_detect(VFIOPCIDevice *vdev, - struct vfio_region_info **opregion, - Error **errp) + struct vfio_region_info **opregion) { int ret; - /* Hotplugging is not supported for opregion access */ - if (vdev->pdev.qdev.hotplugged) { - error_setg(errp, "IGD OpRegion is not supported on hotplugged device"); - return false; - } - ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, opregion); if (ret) { - error_setg_errno(errp, -ret, - "Device does not supports IGD OpRegion feature"); + return false; + } + + /* Hotplugging is not supported for opregion access */ + if (vdev->pdev.qdev.hotplugged) { + warn_report("IGD device detected, but OpRegion is not supported " + "on hotplugged device."); return false; } @@ -524,7 +522,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) } /* IGD device always comes with OpRegion */ - if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) { + if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) { return true; } info_report("OpRegion detected on Intel display %x.", vdev->device_id); @@ -695,7 +693,7 @@ static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp) return true; } - if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) { + if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) { /* Should never reach here, KVMGT always emulates OpRegion */ return false; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index af1c7ab..d3efef7 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -21,20 +21,21 @@ #include "qapi/error.h" #include "system/iommufd.h" #include "hw/qdev-core.h" +#include "hw/vfio/vfio-cpr.h" #include "system/reset.h" #include "qemu/cutils.h" #include "qemu/chardev_open.h" #include "pci.h" #include "vfio-iommufd.h" #include "vfio-helpers.h" -#include "vfio-cpr.h" #include "vfio-listener.h" #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); @@ -592,6 +593,10 @@ found_container: goto err_listener_register; } + /* + * Do not move this code before attachment! The nested IOMMU support + * needs device and hwpt id which are generated only after attachment. + */ if (!vfio_device_hiod_create_and_realize(vbasedev, TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, errp)) { goto err_listener_register; @@ -810,21 +815,38 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, const void *data) vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; +static bool +host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); +} + +static bool +host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); +} + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, Error **errp) { VFIODevice *vdev = opaque; + HostIOMMUDeviceIOMMUFD *idev; HostIOMMUDeviceCaps *caps = &hiod->caps; + VendorCaps *vendor_caps = &caps->vendor_caps; enum iommu_hw_info_type type; - union { - struct iommu_hw_info_vtd vtd; - } data; uint64_t hw_caps; hiod->agent = opaque; - if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, - &type, &data, sizeof(data), + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type, + vendor_caps, sizeof(*vendor_caps), &hw_caps, errp)) { return false; } @@ -833,6 +855,11 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, caps->type = type; caps->hw_caps = hw_caps; + idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + idev->iommufd = vdev->iommufd; + idev->devid = vdev->devid; + idev->hwpt_id = vdev->hwpt->hwpt_id; + return true; } @@ -858,10 +885,14 @@ hiod_iommufd_vfio_get_page_size_mask(HostIOMMUDevice *hiod) static void hiod_iommufd_vfio_class_init(ObjectClass *oc, const void *data) { HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); hiodc->realize = hiod_iommufd_vfio_realize; hiodc->get_iova_ranges = hiod_iommufd_vfio_get_iova_ranges; hiodc->get_page_size_mask = hiod_iommufd_vfio_get_page_size_mask; + + idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; + idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; }; static const TypeInfo types[] = { diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index bfacb3d..203ed03 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -90,16 +90,17 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } -/* Called with rcu_read_lock held. */ -static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - Error **errp) +/* + * Called with rcu_read_lock held. + * The returned MemoryRegion must not be accessed after calling rcu_read_unlock. + */ +static MemoryRegion *vfio_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp) { - bool ret, mr_has_discard_manager; + MemoryRegion *mr; - ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, - &mr_has_discard_manager, errp); - if (ret && mr_has_discard_manager) { + mr = memory_translate_iotlb(iotlb, xlat_p, errp); + if (mr && memory_region_has_ram_discard_manager(mr)) { /* * Malicious VMs might trigger discarding of IOMMU-mapped memory. The * pages will remain pinned inside vfio until unmapped, resulting in a @@ -118,7 +119,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, " intended via an IOMMU. It's possible to mitigate " " by setting/adjusting RLIMIT_MEMLOCK."); } - return ret; + return mr; } static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) @@ -126,6 +127,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; + MemoryRegion *mr; + hwaddr xlat; void *vaddr; int ret; Error *local_err = NULL; @@ -150,10 +153,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { error_report_err(local_err); goto out; } + vaddr = memory_region_get_ram_ptr(mr) + xlat; + read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly; + /* * vaddr is only valid until rcu_read_unlock(). But after * vfio_dma_map has set up the mapping the pages will be @@ -163,7 +170,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) */ ret = vfio_container_dma_map(bcontainer, iova, iotlb->addr_mask + 1, vaddr, - read_only); + read_only, mr); if (ret) { error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -233,7 +240,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, vaddr = memory_region_get_ram_ptr(section->mr) + start; ret = vfio_container_dma_map(bcontainer, iova, next - start, - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -449,6 +456,26 @@ static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) } } +VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainerBase *bcontainer, MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + /* does not return */ + } + return vrdl; +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { @@ -557,7 +584,7 @@ static void vfio_listener_region_add(MemoryListener *listener, } ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -1010,6 +1037,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) ram_addr_t translated_addr; Error *local_err = NULL; int ret = -EINVAL; + MemoryRegion *mr; + hwaddr xlat; trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); @@ -1021,9 +1050,11 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) } rcu_read_lock(); - if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { goto out_unlock; } + translated_addr = memory_region_get_ram_addr(mr) + xlat; ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, translated_addr, &local_err); @@ -1075,19 +1106,8 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - VFIORamDiscardListener *vrdl = NULL; - - QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { - if (vrdl->mr == section->mr && - vrdl->offset_within_address_space == - section->offset_within_address_space) { - break; - } - } - - if (!vrdl) { - hw_error("vfio: Trying to sync missing RAM discard listener"); - } + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); /* * We only want/can synchronize the bitmap for actually mapped parts - diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index a1bfdfe..b1250d8 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -511,6 +511,25 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, kvm_irqchip_commit_routes(kvm_state); } +static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector, + unsigned int nr) +{ + Error *err = NULL; + int32_t fd; + + if (vector->virq >= 0) { + fd = event_notifier_get_fd(&vector->kvm_interrupt); + } else { + fd = event_notifier_get_fd(&vector->interrupt); + } + + if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, + VFIO_IRQ_SET_ACTION_TRIGGER, + fd, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); + } +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { @@ -583,21 +602,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, strerror(-ret)); } } else { - Error *err = NULL; - int32_t fd; - - if (vector->virq >= 0) { - fd = event_notifier_get_fd(&vector->kvm_interrupt); - } else { - fd = event_notifier_get_fd(&vector->interrupt); - } - - if (!vfio_device_irq_set_signaling(&vdev->vbasedev, - VFIO_PCI_MSIX_IRQ_INDEX, nr, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, - &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); - } + set_irq_signalling(&vdev->vbasedev, vector, nr); } } @@ -2854,6 +2859,18 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) static void vfio_pci_put_device(VFIOPCIDevice *vdev) { + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* + * XXX Leaking igd_opregion is not an oversight, we can't remove the + * fw_cfg entry therefore leaking this allocation seems like the safest + * option. + * + * g_free(vdev->igd_opregion); + */ + vfio_device_detach(&vdev->vbasedev); g_free(vdev->vbasedev.name); @@ -3005,6 +3022,19 @@ static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; VFIODevice *vbasedev = &vdev->vbasedev; + uint32_t config_space_size; + int ret; + + config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); + + /* Get a copy of config space */ + ret = vfio_pci_config_space_read(vdev, 0, config_space_size, + vdev->pdev.config); + if (ret < (int)config_space_size) { + ret = ret < 0 ? -ret : EFAULT; + error_setg_errno(errp, ret, "failed to read device config space"); + return false; + } /* vfio emulates a lot for us, but some bits need extra love */ vdev->emulated_config_bits = g_malloc0(vdev->config_size); @@ -3126,15 +3156,14 @@ static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_realize(PCIDevice *pdev, Error **errp) +static void vfio_pci_realize(PCIDevice *pdev, Error **errp) { ERRP_GUARD(); VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; - int i, ret; + int i; char uuid[UUID_STR_LEN]; g_autofree char *name = NULL; - uint32_t config_space_size; if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || @@ -3189,17 +3218,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } - config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); - - /* Get a copy of config space */ - ret = vfio_pci_config_space_read(vdev, 0, config_space_size, - vdev->pdev.config); - if (ret < (int)config_space_size) { - ret = ret < 0 ? -ret : EFAULT; - error_setg_errno(errp, ret, "failed to read device config space"); - goto error; - } - if (!vfio_pci_config_setup(vdev, errp)) { goto error; } @@ -3302,17 +3320,6 @@ static void vfio_instance_finalize(Object *obj) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); - vfio_display_finalize(vdev); - vfio_bars_finalize(vdev); - g_free(vdev->emulated_config_bits); - g_free(vdev->rom); - /* - * XXX Leaking igd_opregion is not an oversight, we can't remove the - * fw_cfg entry therefore leaking this allocation seems like the safest - * option. - * - * g_free(vdev->igd_opregion); - */ vfio_pci_put_device(vdev); } @@ -3514,7 +3521,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif dc->desc = "VFIO-based PCI device assignment"; - pdc->realize = vfio_realize; + pdc->realize = vfio_pci_realize; object_class_property_set_description(klass, /* 1.3 */ "host", diff --git a/hw/vfio/vfio-cpr.h b/hw/vfio/vfio-cpr.h deleted file mode 100644 index 134b83a..0000000 --- a/hw/vfio/vfio-cpr.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * VFIO CPR - * - * Copyright (c) 2025 Oracle and/or its affiliates. - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ - -#ifndef HW_VFIO_CPR_H -#define HW_VFIO_CPR_H - -bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp); -void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer); - -#endif /* HW_VFIO_CPR_H */ diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index e20da95..7061b6e 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -209,6 +209,8 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) int ret; Int128 llend; Error *local_err = NULL; + MemoryRegion *mr; + hwaddr xlat; if (iotlb->target_as != &address_space_memory) { error_report("Wrong target AS \"%s\", only system memory is allowed", @@ -228,11 +230,14 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL, - &local_err)) { + mr = memory_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { error_report_err(local_err); return; } + vaddr = memory_region_get_ram_ptr(mr) + xlat; + read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly; + ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova, iotlb->addr_mask + 1, vaddr, read_only); if (ret) { diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 3d392b0..9d37f86 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -78,7 +78,7 @@ void vfio_address_space_insert(VFIOAddressSpace *space, int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly); + void *vaddr, bool readonly, MemoryRegion *mr); int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb, bool unmap_all); @@ -115,13 +115,57 @@ OBJECT_DECLARE_TYPE(VFIOContainerBase, VFIOIOMMUClass, VFIO_IOMMU) struct VFIOIOMMUClass { ObjectClass parent_class; - /* basic feature */ + /** + * @setup + * + * Perform basic setup of the container, including configuring IOMMU + * capabilities, IOVA ranges, supported page sizes, etc. + * + * @bcontainer: #VFIOContainerBase + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns true to indicate success and false for error. + */ bool (*setup)(VFIOContainerBase *bcontainer, Error **errp); + + /** + * @listener_begin + * + * Called at the beginning of an address space update transaction. + * See #MemoryListener. + * + * @bcontainer: #VFIOContainerBase + */ void (*listener_begin)(VFIOContainerBase *bcontainer); + + /** + * @listener_commit + * + * Called at the end of an address space update transaction, + * See #MemoryListener. + * + * @bcontainer: #VFIOContainerBase + */ void (*listener_commit)(VFIOContainerBase *bcontainer); + + /** + * @dma_map + * + * Map an address range into the container. Note that the memory region is + * referenced within an RCU read lock region across this call. + * + * @bcontainer: #VFIOContainerBase to use + * @iova: start address to map + * @size: size of the range to map + * @vaddr: process virtual address of mapping + * @readonly: true if mapping should be readonly + * @mr: the memory region for this mapping + * + * Returns 0 to indicate success and -errno otherwise. + */ int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly); + void *vaddr, bool readonly, MemoryRegion *mr); /** * @dma_unmap * @@ -132,12 +176,38 @@ struct VFIOIOMMUClass { * @size: size of the range to unmap * @iotlb: The IOMMU TLB mapping entry (or NULL) * @unmap_all: if set, unmap the entire address space + * + * Returns 0 to indicate success and -errno otherwise. */ int (*dma_unmap)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb, bool unmap_all); + + + /** + * @attach_device + * + * Associate the given device with a container and do some related + * initialization of the device context. + * + * @name: name of the device + * @vbasedev: the device + * @as: address space to use + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns true to indicate success and false for error. + */ bool (*attach_device)(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); + + /* + * @detach_device + * + * Detach the given device from its container and clean up any necessary + * state. + * + * @vbasedev: the device to disassociate + */ void (*detach_device)(VFIODevice *vbasedev); /* migration feature */ @@ -152,7 +222,7 @@ struct VFIOIOMMUClass { * @start: indicates whether to start or stop dirty pages tracking * @errp: pointer to Error*, to store an error if it happens. * - * Returns zero to indicate success and negative for error + * Returns zero to indicate success and negative for error. */ int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer, bool start, Error **errp); @@ -167,7 +237,7 @@ struct VFIOIOMMUClass { * @size: size of iova range * @errp: pointer to Error*, to store an error if it happens. * - * Returns zero to indicate success and negative for error + * Returns zero to indicate success and negative for error. */ int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp); @@ -183,4 +253,7 @@ struct VFIOIOMMUClass { void (*release)(VFIOContainerBase *bcontainer); }; +VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainerBase *bcontainer, MemoryRegionSection *section); + #endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h new file mode 100644 index 0000000..750ea5b --- /dev/null +++ b/include/hw/vfio/vfio-cpr.h @@ -0,0 +1,18 @@ +/* + * VFIO CPR + * + * Copyright (c) 2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_CPR_H +#define HW_VFIO_VFIO_CPR_H + +struct VFIOContainerBase; + +bool vfio_cpr_register_container(struct VFIOContainerBase *bcontainer, + Error **errp); +void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer); + +#endif /* HW_VFIO_VFIO_CPR_H */ diff --git a/include/qapi/error-internal.h b/include/qapi/error-internal.h new file mode 100644 index 0000000..ff18a20 --- /dev/null +++ b/include/qapi/error-internal.h @@ -0,0 +1,35 @@ +/* + * QEMU Error Objects - struct definition + * + * Copyright IBM, Corp. 2011 + * Copyright (C) 2011-2015 Red Hat, Inc. + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2. See + * the COPYING.LIB file in the top-level directory. + */ + +#ifndef QAPI_ERROR_INTERNAL_H + +struct Error +{ + char *msg; + ErrorClass err_class; + + /* Used for error_abort only, may be NULL. */ + const char *func; + + /* + * src might be NUL-terminated or not. If it is, src_len is negative. + * If it is not, src_len is the length. + */ + const char *src; + int src_len; + int line; + GString *hint; +}; + +#endif diff --git a/include/qemu/futex.h b/include/qemu/futex.h index 91ae889..607613e 100644 --- a/include/qemu/futex.h +++ b/include/qemu/futex.h @@ -1,5 +1,5 @@ /* - * Wrappers around Linux futex syscall + * Wrappers around Linux futex syscall and similar * * Copyright Red Hat, Inc. 2017 * @@ -11,17 +11,35 @@ * */ +/* + * Note that a wake-up can also be caused by common futex usage patterns in + * unrelated code that happened to have previously used the futex word's + * memory location (e.g., typical futex-based implementations of Pthreads + * mutexes can cause this under some conditions). Therefore, qemu_futex_wait() + * callers should always conservatively assume that it is a spurious wake-up, + * and use the futex word's value (i.e., the user-space synchronization scheme) + * to decide whether to continue to block or not. + */ + #ifndef QEMU_FUTEX_H #define QEMU_FUTEX_H +#define HAVE_FUTEX + +#ifdef CONFIG_LINUX #include <sys/syscall.h> #include <linux/futex.h> #define qemu_futex(...) syscall(__NR_futex, __VA_ARGS__) -static inline void qemu_futex_wake(void *f, int n) +static inline void qemu_futex_wake_all(void *f) { - qemu_futex(f, FUTEX_WAKE, n, NULL, NULL, 0); + qemu_futex(f, FUTEX_WAKE, INT_MAX, NULL, NULL, 0); +} + +static inline void qemu_futex_wake_single(void *f) +{ + qemu_futex(f, FUTEX_WAKE, 1, NULL, NULL, 0); } static inline void qemu_futex_wait(void *f, unsigned val) @@ -37,5 +55,25 @@ static inline void qemu_futex_wait(void *f, unsigned val) } } } +#elif defined(CONFIG_WIN32) +#include <synchapi.h> + +static inline void qemu_futex_wake_all(void *f) +{ + WakeByAddressAll(f); +} + +static inline void qemu_futex_wake_single(void *f) +{ + WakeByAddressSingle(f); +} + +static inline void qemu_futex_wait(void *f, unsigned val) +{ + WaitOnAddress(f, &val, sizeof(val), INFINITE); +} +#else +#undef HAVE_FUTEX +#endif #endif /* QEMU_FUTEX_H */ diff --git a/include/qemu/lockcnt.h b/include/qemu/lockcnt.h index f4b62a3..5a2800e 100644 --- a/include/qemu/lockcnt.h +++ b/include/qemu/lockcnt.h @@ -17,7 +17,7 @@ typedef struct QemuLockCnt QemuLockCnt; struct QemuLockCnt { -#ifndef CONFIG_LINUX +#ifndef HAVE_FUTEX QemuMutex mutex; #endif unsigned count; diff --git a/include/qemu/thread-posix.h b/include/qemu/thread-posix.h index 5f2f3d1..758808b 100644 --- a/include/qemu/thread-posix.h +++ b/include/qemu/thread-posix.h @@ -32,15 +32,6 @@ struct QemuSemaphore { unsigned int count; }; -struct QemuEvent { -#ifndef __linux__ - pthread_mutex_t lock; - pthread_cond_t cond; -#endif - unsigned value; - bool initialized; -}; - struct QemuThread { pthread_t thread; }; diff --git a/include/qemu/thread-win32.h b/include/qemu/thread-win32.h index d95af44..da9e732 100644 --- a/include/qemu/thread-win32.h +++ b/include/qemu/thread-win32.h @@ -28,12 +28,6 @@ struct QemuSemaphore { bool initialized; }; -struct QemuEvent { - int value; - HANDLE event; - bool initialized; -}; - typedef struct QemuThreadData QemuThreadData; struct QemuThread { QemuThreadData *data; diff --git a/include/qemu/thread.h b/include/qemu/thread.h index 6f800aa..f0302ed 100644 --- a/include/qemu/thread.h +++ b/include/qemu/thread.h @@ -3,13 +3,32 @@ #include "qemu/processor.h" #include "qemu/atomic.h" +#include "qemu/futex.h" typedef struct QemuCond QemuCond; typedef struct QemuSemaphore QemuSemaphore; -typedef struct QemuEvent QemuEvent; typedef struct QemuLockCnt QemuLockCnt; typedef struct QemuThread QemuThread; +/* + * QemuEvent + * ========= + * + * QemuEvent is an implementation of Win32 manual-reset event object. + * For details, refer to: + * https://learn.microsoft.com/en-us/windows/win32/sync/using-event-objects + * + * QemuEvent is more lightweight than QemuSemaphore when HAVE_FUTEX is defined. + */ +typedef struct QemuEvent { +#ifndef HAVE_FUTEX + pthread_mutex_t lock; + pthread_cond_t cond; +#endif + unsigned value; + bool initialized; +} QemuEvent; + #ifdef _WIN32 #include "qemu/thread-win32.h" #else diff --git a/include/system/host_iommu_device.h b/include/system/host_iommu_device.h index 809cced..ab849a4 100644 --- a/include/system/host_iommu_device.h +++ b/include/system/host_iommu_device.h @@ -14,6 +14,13 @@ #include "qom/object.h" #include "qapi/error.h" +#ifdef CONFIG_LINUX +#include "linux/iommufd.h" + +typedef union VendorCaps { + struct iommu_hw_info_vtd vtd; + struct iommu_hw_info_arm_smmuv3 smmuv3; +} VendorCaps; /** * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. @@ -22,11 +29,17 @@ * * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) + * + * @vendor_caps: host platform IOMMU vendor specific capabilities (e.g. on + * IOMMUFD this represents a user-space buffer filled by kernel + * with host IOMMU @type specific hardware information data) */ typedef struct HostIOMMUDeviceCaps { uint32_t type; uint64_t hw_caps; + VendorCaps vendor_caps; } HostIOMMUDeviceCaps; +#endif #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) @@ -38,7 +51,9 @@ struct HostIOMMUDevice { void *agent; /* pointer to agent device, ie. VFIO or VDPA device */ PCIBus *aliased_bus; int aliased_devfn; +#ifdef CONFIG_LINUX HostIOMMUDeviceCaps caps; +#endif }; /** diff --git a/include/system/iommufd.h b/include/system/iommufd.h index cbab75b..283861b 100644 --- a/include/system/iommufd.h +++ b/include/system/iommufd.h @@ -61,6 +61,60 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, uint64_t iova, ram_addr_t size, uint64_t page_size, uint64_t *data, Error **errp); +bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" +OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, + HOST_IOMMU_DEVICE_IOMMUFD) + +/* Overload of the host IOMMU device for the iommufd backend */ +struct HostIOMMUDeviceIOMMUFD { + HostIOMMUDevice parent_obj; + + IOMMUFDBackend *iommufd; + uint32_t devid; + uint32_t hwpt_id; +}; + +struct HostIOMMUDeviceIOMMUFDClass { + HostIOMMUDeviceClass parent_class; + + /** + * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table. + * VFIO and VDPA device can have different implementation. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @hwpt_id: ID of IOMMUFD hardware page table. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, + Error **errp); + /** + * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table. + * VFIO and VDPA device can have different implementation. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp); +}; + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp); +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp); #endif diff --git a/include/system/kvm.h b/include/system/kvm.h index 62ec131..7cc60d2 100644 --- a/include/system/kvm.h +++ b/include/system/kvm.h @@ -42,6 +42,7 @@ extern bool kvm_gsi_routing_allowed; extern bool kvm_gsi_direct_mapping; extern bool kvm_readonly_mem_allowed; extern bool kvm_msi_use_devid; +extern bool kvm_pre_fault_memory_supported; #define kvm_enabled() (kvm_allowed) /** diff --git a/include/system/memory.h b/include/system/memory.h index fc35a0d..0848690 100644 --- a/include/system/memory.h +++ b/include/system/memory.h @@ -739,21 +739,20 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, RamDiscardListener *rdl); /** - * memory_get_xlat_addr: Extract addresses from a TLB entry + * memory_translate_iotlb: Extract addresses from a TLB entry. + * Called with rcu_read_lock held. * * @iotlb: pointer to an #IOMMUTLBEntry - * @vaddr: virtual address - * @ram_addr: RAM address - * @read_only: indicates if writes are allowed - * @mr_has_discard_manager: indicates memory is controlled by a - * RamDiscardManager + * @xlat_p: return the offset of the entry from the start of the returned + * MemoryRegion. * @errp: pointer to Error*, to store an error if it happens. * - * Return: true on success, else false setting @errp with error. + * Return: On success, return the MemoryRegion containing the @iotlb translated + * addr. The MemoryRegion must not be accessed after rcu_read_unlock. + * On failure, return NULL, setting @errp with error. */ -bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - bool *mr_has_discard_manager, Error **errp); +MemoryRegion *memory_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp); typedef struct CoalescedMemoryRange CoalescedMemoryRange; typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd; diff --git a/meson.build b/meson.build index 967a10e..34729c2 100644 --- a/meson.build +++ b/meson.build @@ -838,11 +838,18 @@ emulator_link_args = [] midl = not_found widl = not_found pathcch = not_found +synchronization = not_found host_dsosuf = '.so' if host_os == 'windows' midl = find_program('midl', required: false) widl = find_program('widl', required: false) pathcch = cc.find_library('pathcch') + synchronization = cc.find_library('Synchronization', required: false) + if not synchronization.found() + # The library name is lowercase on mingw + synchronization = cc.find_library('synchronization', required: true) + endif + socket = cc.find_library('ws2_32') winmm = cc.find_library('winmm') diff --git a/migration/colo.c b/migration/colo.c index c976b3f..e0f713c 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -146,7 +146,7 @@ static void secondary_vm_do_failover(void) return; } /* Notify COLO incoming thread that failover work is finished */ - qemu_sem_post(&mis->colo_incoming_sem); + qemu_event_set(&mis->colo_incoming_event); /* For Secondary VM, jump to incoming co */ if (mis->colo_incoming_co) { @@ -195,7 +195,7 @@ static void primary_vm_do_failover(void) } /* Notify COLO thread that failover work is finished */ - qemu_sem_post(&s->colo_exit_sem); + qemu_event_set(&s->colo_exit_event); } COLOMode get_colo_mode(void) @@ -620,8 +620,8 @@ out: } /* Hope this not to be too long to wait here */ - qemu_sem_wait(&s->colo_exit_sem); - qemu_sem_destroy(&s->colo_exit_sem); + qemu_event_wait(&s->colo_exit_event); + qemu_event_destroy(&s->colo_exit_event); /* * It is safe to unregister notifier after failover finished. @@ -651,7 +651,7 @@ void migrate_start_colo_process(MigrationState *s) s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, colo_checkpoint_notify_timer, NULL); - qemu_sem_init(&s->colo_exit_sem, 0); + qemu_event_init(&s->colo_exit_event, false); colo_process_checkpoint(s); bql_lock(); } @@ -808,11 +808,11 @@ void colo_shutdown(void) case COLO_MODE_PRIMARY: s = migrate_get_current(); qemu_event_set(&s->colo_checkpoint_event); - qemu_sem_post(&s->colo_exit_sem); + qemu_event_set(&s->colo_exit_event); break; case COLO_MODE_SECONDARY: mis = migration_incoming_get_current(); - qemu_sem_post(&mis->colo_incoming_sem); + qemu_event_set(&mis->colo_incoming_event); break; default: break; @@ -827,7 +827,7 @@ static void *colo_process_incoming_thread(void *opaque) Error *local_err = NULL; rcu_register_thread(); - qemu_sem_init(&mis->colo_incoming_sem, 0); + qemu_event_init(&mis->colo_incoming_event, false); migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); @@ -923,8 +923,8 @@ out: } /* Hope this not to be too long to loop here */ - qemu_sem_wait(&mis->colo_incoming_sem); - qemu_sem_destroy(&mis->colo_incoming_sem); + qemu_event_wait(&mis->colo_incoming_event); + qemu_event_destroy(&mis->colo_incoming_event); rcu_unregister_thread(); return NULL; diff --git a/migration/migration.c b/migration/migration.c index 4697732..4098870 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1630,7 +1630,7 @@ void migration_cancel(void) } /* If the migration is paused, kick it out of the pause */ if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) { - qemu_sem_post(&s->pause_sem); + qemu_event_set(&s->pause_event); } migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING); } while (s->state != MIGRATION_STATUS_CANCELLING); @@ -2342,7 +2342,7 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp) MigrationStatus_str(s->state)); return; } - qemu_sem_post(&s->pause_sem); + qemu_event_set(&s->pause_event); } int migration_rp_wait(MigrationState *s) @@ -2911,21 +2911,18 @@ static bool migration_switchover_prepare(MigrationState *s) return true; } - /* Since leaving this state is not atomic with posting the semaphore + /* + * Since leaving this state is not atomic with setting the event * it's possible that someone could have issued multiple migrate_continue - * and the semaphore is incorrectly positive at this point; - * the docs say it's undefined to reinit a semaphore that's already - * init'd, so use timedwait to eat up any existing posts. + * and the event is incorrectly set at this point so reset it. */ - while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) { - /* This block intentionally left blank */ - } + qemu_event_reset(&s->pause_event); /* Update [POSTCOPY_]ACTIVE to PRE_SWITCHOVER */ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_PRE_SWITCHOVER); bql_unlock(); - qemu_sem_wait(&s->pause_sem); + qemu_event_wait(&s->pause_event); bql_lock(); /* @@ -4057,7 +4054,7 @@ static void migration_instance_finalize(Object *obj) qemu_mutex_destroy(&ms->qemu_file_lock); qemu_sem_destroy(&ms->wait_unplug_sem); qemu_sem_destroy(&ms->rate_limit_sem); - qemu_sem_destroy(&ms->pause_sem); + qemu_event_destroy(&ms->pause_event); qemu_sem_destroy(&ms->postcopy_pause_sem); qemu_sem_destroy(&ms->rp_state.rp_sem); qemu_sem_destroy(&ms->rp_state.rp_pong_acks); @@ -4072,7 +4069,7 @@ static void migration_instance_init(Object *obj) ms->state = MIGRATION_STATUS_NONE; ms->mbps = -1; ms->pages_per_second = -1; - qemu_sem_init(&ms->pause_sem, 0); + qemu_event_init(&ms->pause_event, false); qemu_mutex_init(&ms->error_mutex); migrate_params_init(&ms->parameters); diff --git a/migration/migration.h b/migration/migration.h index d53f7ca..739289d 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -98,9 +98,9 @@ struct MigrationIncomingState { void (*transport_cleanup)(void *data); /* * Used to sync thread creations. Note that we can't create threads in - * parallel with this sem. + * parallel with this event. */ - QemuSemaphore thread_sync_sem; + QemuEvent thread_sync_event; /* * Free at the start of the main state load, set as the main thread finishes * loading state. @@ -186,7 +186,7 @@ struct MigrationIncomingState { /* The coroutine we should enter (back) after failover */ Coroutine *colo_incoming_co; - QemuSemaphore colo_incoming_sem; + QemuEvent colo_incoming_event; /* Optional load threads pool and its thread exit request flag */ ThreadPool *load_threads; @@ -379,10 +379,10 @@ struct MigrationState { QemuSemaphore wait_unplug_sem; /* Migration is paused due to pause-before-switchover */ - QemuSemaphore pause_sem; + QemuEvent pause_event; - /* The semaphore is used to notify COLO thread that failover is finished */ - QemuSemaphore colo_exit_sem; + /* The event is used to notify COLO thread that failover is finished */ + QemuEvent colo_exit_event; /* The event is used to notify COLO thread to do checkpoint */ QemuEvent colo_checkpoint_event; diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 995614b..75fd310 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -90,10 +90,10 @@ void postcopy_thread_create(MigrationIncomingState *mis, QemuThread *thread, const char *name, void *(*fn)(void *), int joinable) { - qemu_sem_init(&mis->thread_sync_sem, 0); + qemu_event_init(&mis->thread_sync_event, false); qemu_thread_create(thread, name, fn, mis, joinable); - qemu_sem_wait(&mis->thread_sync_sem); - qemu_sem_destroy(&mis->thread_sync_sem); + qemu_event_wait(&mis->thread_sync_event); + qemu_event_destroy(&mis->thread_sync_event); } /* Postcopy needs to detect accesses to pages that haven't yet been copied @@ -964,7 +964,7 @@ static void *postcopy_ram_fault_thread(void *opaque) trace_postcopy_ram_fault_thread_entry(); rcu_register_thread(); mis->last_rb = NULL; /* last RAMBlock we sent part of */ - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); struct pollfd *pfd; size_t pfd_len = 2 + mis->postcopy_remote_fds->len; @@ -1716,7 +1716,7 @@ void *postcopy_preempt_thread(void *opaque) rcu_register_thread(); - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); /* * The preempt channel is established in asynchronous way. Wait diff --git a/migration/savevm.c b/migration/savevm.c index 006514c..52105dd 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2078,7 +2078,7 @@ static void *postcopy_ram_listen_thread(void *opaque) migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_POSTCOPY_ACTIVE); - qemu_sem_post(&mis->thread_sync_sem); + qemu_event_set(&mis->thread_sync_event); trace_postcopy_ram_listen_thread_start(); rcu_register_thread(); diff --git a/python/setup.cfg b/python/setup.cfg index c48dff2..d7f5dc7 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -46,6 +46,7 @@ devel = urwid >= 2.1.2 urwid-readline >= 0.13 Pygments >= 2.9.0 + sphinx >= 3.4.3 # Provides qom-fuse functionality fuse = @@ -78,7 +79,6 @@ exclude = __pycache__, [mypy] strict = True python_version = 3.9 -warn_unused_configs = True namespace_packages = True warn_unused_ignores = False diff --git a/python/tests/minreqs.txt b/python/tests/minreqs.txt index 6445407..cd2e2a8 100644 --- a/python/tests/minreqs.txt +++ b/python/tests/minreqs.txt @@ -11,6 +11,15 @@ # When adding new dependencies, pin the very oldest non-yanked version # on PyPI that allows the test suite to pass. +# For some reason, the presence of packaging==14.0 below requires us to +# also pin setuptools to version 70 or below. Otherwise, the +# installation of the QEMU package itself fails, failing to find +# setuptools. +setuptools<=70 + +# Dependencies for qapidoc/qapi_domain et al +sphinx==3.4.3 + # Dependencies for the TUI addon (Required for successful linting) urwid==2.1.2 urwid-readline==0.13 @@ -38,10 +47,32 @@ pyflakes==2.5.0 # Transitive mypy dependencies mypy-extensions==1.0.0 +tomli==1.1.0 typing-extensions==4.7.1 # Transitive pylint dependencies astroid==2.15.4 +dill==0.2 lazy-object-proxy==1.4.0 +platformdirs==2.2.0 toml==0.10.0 +tomlkit==0.10.1 wrapt==1.14.0 + +# Transitive sphinx dependencies +Jinja2==2.7 +MarkupSafe==1.1.0 +alabaster==0.7.1 +babel==1.3 +docutils==0.12 +imagesize==0.5.0 +packaging==14.0 +pytz==2011b0 +requests==2.5.0 +snowballstemmer==1.1 +sphinxcontrib-applehelp==1.0.0 +sphinxcontrib-devhelp==1.0.0 +sphinxcontrib-htmlhelp==1.0.0 +sphinxcontrib-jsmath==1.0.0 +sphinxcontrib-qthelp==1.0.0 +sphinxcontrib-serializinghtml==1.0.0 diff --git a/python/tests/qapi-flake8.sh b/python/tests/qapi-flake8.sh new file mode 100755 index 0000000..c69f9ea --- /dev/null +++ b/python/tests/qapi-flake8.sh @@ -0,0 +1,6 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0-or-later + +python3 -m flake8 ../scripts/qapi/ \ + ../docs/sphinx/qapidoc.py \ + ../docs/sphinx/qapi_domain.py diff --git a/python/tests/qapi-isort.sh b/python/tests/qapi-isort.sh new file mode 100755 index 0000000..78dd947 --- /dev/null +++ b/python/tests/qapi-isort.sh @@ -0,0 +1,8 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0-or-later + +python3 -m isort --sp . -c ../scripts/qapi/ +# Force isort to recognize "compat" as a local module and not third-party +python3 -m isort --sp . -c -p compat -p qapidoc_legacy \ + ../docs/sphinx/qapi_domain.py \ + ../docs/sphinx/qapidoc.py diff --git a/python/tests/qapi-mypy.sh b/python/tests/qapi-mypy.sh new file mode 100755 index 0000000..363dbaf --- /dev/null +++ b/python/tests/qapi-mypy.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0-or-later + +python3 -m mypy ../scripts/qapi diff --git a/python/tests/qapi-pylint.sh b/python/tests/qapi-pylint.sh new file mode 100755 index 0000000..8767d9d --- /dev/null +++ b/python/tests/qapi-pylint.sh @@ -0,0 +1,8 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0-or-later + +SETUPTOOLS_USE_DISTUTILS=stdlib python3 -m pylint \ + --rcfile=../scripts/qapi/pylintrc \ + ../scripts/qapi/ \ + ../docs/sphinx/qapidoc.py \ + ../docs/sphinx/qapi_domain.py diff --git a/rust/Cargo.lock b/rust/Cargo.lock index bccfe85..b785c71 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -3,6 +3,12 @@ version = 3 [[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + +[[package]] name = "arbitrary-int" version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -45,6 +51,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] +name = "foreign" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17ca1b5be8c9d320daf386f1809c7acc0cb09accbae795c2001953fa50585846" +dependencies = [ + "libc", +] + +[[package]] name = "hpet" version = "0.1.0" dependencies = [ @@ -114,6 +129,8 @@ dependencies = [ name = "qemu_api" version = "0.1.0" dependencies = [ + "anyhow", + "foreign", "libc", "qemu_api_macros", ] diff --git a/rust/Cargo.toml b/rust/Cargo.toml index fd4c2fb..0868e1b 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -67,6 +67,7 @@ missing_safety_doc = "deny" mut_mut = "deny" needless_bitwise_bool = "deny" needless_pass_by_ref_mut = "deny" +needless_update = "deny" no_effect_underscore_binding = "deny" option_option = "deny" or_fun_call = "deny" diff --git a/rust/hw/char/pl011/src/device.rs b/rust/hw/char/pl011/src/device.rs index 0501fa5..be8387f 100644 --- a/rust/hw/char/pl011/src/device.rs +++ b/rust/hw/char/pl011/src/device.rs @@ -175,7 +175,7 @@ impl DeviceImpl for PL011State { fn vmsd() -> Option<&'static VMStateDescription> { Some(&device_class::VMSTATE_PL011) } - const REALIZE: Option<fn(&Self)> = Some(Self::realize); + const REALIZE: Option<fn(&Self) -> qemu_api::Result<()>> = Some(Self::realize); } impl ResettablePhasesImpl for PL011State { @@ -619,9 +619,10 @@ impl PL011State { } } - fn realize(&self) { + fn realize(&self) -> qemu_api::Result<()> { self.char_backend .enable_handlers(self, Self::can_receive, Self::receive, Self::event); + Ok(()) } fn reset_hold(&self, _type: ResetType) { diff --git a/rust/hw/timer/hpet/src/device.rs b/rust/hw/timer/hpet/src/device.rs index e3ba62b..735b2fb 100644 --- a/rust/hw/timer/hpet/src/device.rs +++ b/rust/hw/timer/hpet/src/device.rs @@ -12,7 +12,7 @@ use std::{ use qemu_api::{ bindings::{ address_space_memory, address_space_stl_le, qdev_prop_bit, qdev_prop_bool, - qdev_prop_uint32, qdev_prop_uint8, + qdev_prop_uint32, qdev_prop_usize, }, cell::{BqlCell, BqlRefCell}, irq::InterruptSource, @@ -36,9 +36,9 @@ use crate::fw_cfg::HPETFwConfig; const HPET_REG_SPACE_LEN: u64 = 0x400; // 1024 bytes /// Minimum recommended hardware implementation. -const HPET_MIN_TIMERS: u8 = 3; +const HPET_MIN_TIMERS: usize = 3; /// Maximum timers in each timer block. -const HPET_MAX_TIMERS: u8 = 32; +const HPET_MAX_TIMERS: usize = 32; /// Flags that HPETState.flags supports. const HPET_FLAG_MSI_SUPPORT_SHIFT: usize = 0; @@ -561,8 +561,8 @@ pub struct HPETState { /// HPET timer array managed by this timer block. #[doc(alias = "timer")] - timers: [BqlRefCell<HPETTimer>; HPET_MAX_TIMERS as usize], - num_timers: BqlCell<u8>, + timers: [BqlRefCell<HPETTimer>; HPET_MAX_TIMERS], + num_timers: usize, num_timers_save: BqlCell<u8>, /// Instance id (HPET timer block ID). @@ -570,11 +570,6 @@ pub struct HPETState { } impl HPETState { - // Get num_timers with `usize` type, which is useful to play with array index. - fn get_num_timers(&self) -> usize { - self.num_timers.get().into() - } - const fn has_msi_flag(&self) -> bool { self.flags & (1 << HPET_FLAG_MSI_SUPPORT_SHIFT) != 0 } @@ -636,7 +631,7 @@ impl HPETState { self.hpet_offset .set(ticks_to_ns(self.counter.get()) - CLOCK_VIRTUAL.get_ns()); - for timer in self.timers.iter().take(self.get_num_timers()) { + for timer in self.timers.iter().take(self.num_timers) { let mut t = timer.borrow_mut(); if t.is_int_enabled() && t.is_int_active() { @@ -648,7 +643,7 @@ impl HPETState { // Halt main counter and disable interrupt generation. self.counter.set(self.get_ticks()); - for timer in self.timers.iter().take(self.get_num_timers()) { + for timer in self.timers.iter().take(self.num_timers) { timer.borrow_mut().del_timer(); } } @@ -671,7 +666,7 @@ impl HPETState { let new_val = val << shift; let cleared = new_val & self.int_status.get(); - for (index, timer) in self.timers.iter().take(self.get_num_timers()).enumerate() { + for (index, timer) in self.timers.iter().take(self.num_timers).enumerate() { if cleared & (1 << index) != 0 { timer.borrow_mut().update_irq(false); } @@ -724,19 +719,17 @@ impl HPETState { } } - fn realize(&self) { + fn realize(&self) -> qemu_api::Result<()> { + if self.num_timers < HPET_MIN_TIMERS || self.num_timers > HPET_MAX_TIMERS { + Err(format!( + "hpet.num_timers must be between {HPET_MIN_TIMERS} and {HPET_MAX_TIMERS}" + ))?; + } if self.int_route_cap == 0 { - // TODO: Add error binding: warn_report() - println!("Hpet's hpet-intcap property not initialized"); + Err("hpet.hpet-intcap property not initialized")?; } - self.hpet_id.set(HPETFwConfig::assign_hpet_id()); - - if self.num_timers.get() < HPET_MIN_TIMERS { - self.num_timers.set(HPET_MIN_TIMERS); - } else if self.num_timers.get() > HPET_MAX_TIMERS { - self.num_timers.set(HPET_MAX_TIMERS); - } + self.hpet_id.set(HPETFwConfig::assign_hpet_id()?); self.init_timer(); // 64-bit General Capabilities and ID Register; LegacyReplacementRoute. @@ -745,16 +738,17 @@ impl HPETState { 1 << HPET_CAP_COUNT_SIZE_CAP_SHIFT | 1 << HPET_CAP_LEG_RT_CAP_SHIFT | HPET_CAP_VENDER_ID_VALUE << HPET_CAP_VENDER_ID_SHIFT | - ((self.get_num_timers() - 1) as u64) << HPET_CAP_NUM_TIM_SHIFT | // indicate the last timer + ((self.num_timers - 1) as u64) << HPET_CAP_NUM_TIM_SHIFT | // indicate the last timer (HPET_CLK_PERIOD * FS_PER_NS) << HPET_CAP_CNT_CLK_PERIOD_SHIFT, // 10 ns ); self.init_gpio_in(2, HPETState::handle_legacy_irq); self.init_gpio_out(from_ref(&self.pit_enabled)); + Ok(()) } fn reset_hold(&self, _type: ResetType) { - for timer in self.timers.iter().take(self.get_num_timers()) { + for timer in self.timers.iter().take(self.num_timers) { timer.borrow_mut().reset(); } @@ -782,7 +776,7 @@ impl HPETState { GlobalRegister::try_from(addr).map(HPETRegister::Global) } else { let timer_id: usize = ((addr - 0x100) / 0x20) as usize; - if timer_id <= self.get_num_timers() { + if timer_id < self.num_timers { // TODO: Add trace point - trace_hpet_ram_[read|write]_timer_id(timer_id) TimerRegister::try_from(addr & 0x18) .map(|reg| HPETRegister::Timer(&self.timers[timer_id], reg)) @@ -853,12 +847,12 @@ impl HPETState { * also added to the migration stream. Check that it matches the value * that was configured. */ - self.num_timers_save.set(self.num_timers.get()); + self.num_timers_save.set(self.num_timers as u8); 0 } fn post_load(&self, _version_id: u8) -> i32 { - for timer in self.timers.iter().take(self.get_num_timers()) { + for timer in self.timers.iter().take(self.num_timers) { let mut t = timer.borrow_mut(); t.cmp64 = t.calculate_cmp64(t.get_state().counter.get(), t.cmp); @@ -883,7 +877,7 @@ impl HPETState { } fn validate_num_timers(&self, _version_id: u8) -> bool { - self.num_timers.get() == self.num_timers_save.get() + self.num_timers == self.num_timers_save.get().into() } } @@ -910,7 +904,7 @@ qemu_api::declare_properties! { c"timers", HPETState, num_timers, - unsafe { &qdev_prop_uint8 }, + unsafe { &qdev_prop_usize }, u8, default = HPET_MIN_TIMERS ), @@ -1015,16 +1009,16 @@ const VALIDATE_TIMERS_NAME: &CStr = c"num_timers must match"; static VMSTATE_HPET: VMStateDescription = VMStateDescription { name: c"hpet".as_ptr(), version_id: 2, - minimum_version_id: 1, + minimum_version_id: 2, pre_save: Some(hpet_pre_save), post_load: Some(hpet_post_load), fields: vmstate_fields! { vmstate_of!(HPETState, config), vmstate_of!(HPETState, int_status), vmstate_of!(HPETState, counter), - vmstate_of!(HPETState, num_timers_save).with_version_id(2), + vmstate_of!(HPETState, num_timers_save), vmstate_validate!(HPETState, VALIDATE_TIMERS_NAME, HPETState::validate_num_timers), - vmstate_struct!(HPETState, timers[0 .. num_timers], &VMSTATE_HPET_TIMER, BqlRefCell<HPETTimer>, HPETState::validate_num_timers).with_version_id(0), + vmstate_struct!(HPETState, timers[0 .. num_timers_save], &VMSTATE_HPET_TIMER, BqlRefCell<HPETTimer>, HPETState::validate_num_timers).with_version_id(0), }, subsections: vmstate_subsections! { VMSTATE_HPET_RTC_IRQ_LEVEL, @@ -1042,7 +1036,7 @@ impl DeviceImpl for HPETState { Some(&VMSTATE_HPET) } - const REALIZE: Option<fn(&Self)> = Some(Self::realize); + const REALIZE: Option<fn(&Self) -> qemu_api::Result<()>> = Some(Self::realize); } impl ResettablePhasesImpl for HPETState { diff --git a/rust/hw/timer/hpet/src/fw_cfg.rs b/rust/hw/timer/hpet/src/fw_cfg.rs index 6c10316..619d662 100644 --- a/rust/hw/timer/hpet/src/fw_cfg.rs +++ b/rust/hw/timer/hpet/src/fw_cfg.rs @@ -36,7 +36,7 @@ pub static mut hpet_fw_cfg: HPETFwConfig = HPETFwConfig { }; impl HPETFwConfig { - pub(crate) fn assign_hpet_id() -> usize { + pub(crate) fn assign_hpet_id() -> Result<usize, &'static str> { assert!(bql_locked()); // SAFETY: all accesses go through these methods, which guarantee // that the accesses are protected by the BQL. @@ -48,13 +48,12 @@ impl HPETFwConfig { } if fw_cfg.count == 8 { - // TODO: Add error binding: error_setg() - panic!("Only 8 instances of HPET is allowed"); + Err("Only 8 instances of HPET are allowed")?; } let id: usize = fw_cfg.count.into(); fw_cfg.count += 1; - id + Ok(id) } pub(crate) fn update_hpet_cfg(hpet_id: usize, timer_block_id: u32, address: u64) { diff --git a/rust/meson.build b/rust/meson.build index b1b3315..99ae795 100644 --- a/rust/meson.build +++ b/rust/meson.build @@ -1,9 +1,13 @@ +subproject('anyhow-1-rs', required: true) subproject('bilge-0.2-rs', required: true) subproject('bilge-impl-0.2-rs', required: true) +subproject('foreign-0.3-rs', required: true) subproject('libc-0.2-rs', required: true) +anyhow_rs = dependency('anyhow-1-rs') bilge_rs = dependency('bilge-0.2-rs') bilge_impl_rs = dependency('bilge-impl-0.2-rs') +foreign_rs = dependency('foreign-0.3-rs') libc_rs = dependency('libc-0.2-rs') subproject('proc-macro2-1-rs', required: true) diff --git a/rust/qemu-api-macros/src/lib.rs b/rust/qemu-api-macros/src/lib.rs index 1034707..c18bb4e 100644 --- a/rust/qemu-api-macros/src/lib.rs +++ b/rust/qemu-api-macros/src/lib.rs @@ -203,8 +203,8 @@ fn derive_tryinto_body( Ok(quote! { #(const #discriminants: #repr = #name::#discriminants as #repr;)*; match value { - #(#discriminants => Ok(#name::#discriminants),)* - _ => Err(value), + #(#discriminants => core::result::Result::Ok(#name::#discriminants),)* + _ => core::result::Result::Err(value), } }) } @@ -236,7 +236,8 @@ fn derive_tryinto_or_error(input: DeriveInput) -> Result<proc_macro2::TokenStrea impl core::convert::TryFrom<#repr> for #name { type Error = #repr; - fn try_from(value: #repr) -> Result<Self, Self::Error> { + #[allow(ambiguous_associated_items)] + fn try_from(value: #repr) -> Result<Self, #repr> { #body } } diff --git a/rust/qemu-api/Cargo.toml b/rust/qemu-api/Cargo.toml index c96cf50..db7000d 100644 --- a/rust/qemu-api/Cargo.toml +++ b/rust/qemu-api/Cargo.toml @@ -15,7 +15,9 @@ rust-version.workspace = true [dependencies] qemu_api_macros = { path = "../qemu-api-macros" } +anyhow = "~1.0" libc = "0.2.162" +foreign = "~0.3.1" [features] default = ["debug_cell"] diff --git a/rust/qemu-api/meson.build b/rust/qemu-api/meson.build index b532281..cac8595 100644 --- a/rust/qemu-api/meson.build +++ b/rust/qemu-api/meson.build @@ -19,6 +19,7 @@ _qemu_api_rs = static_library( 'src/cell.rs', 'src/chardev.rs', 'src/errno.rs', + 'src/error.rs', 'src/irq.rs', 'src/memory.rs', 'src/module.rs', @@ -35,7 +36,7 @@ _qemu_api_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', rust_args: _qemu_api_cfg, - dependencies: [libc_rs, qemu_api_macros, qemuutil_rs, + dependencies: [anyhow_rs, foreign_rs, libc_rs, qemu_api_macros, qemuutil_rs, qom, hwcore, chardev, migration], ) diff --git a/rust/qemu-api/src/error.rs b/rust/qemu-api/src/error.rs new file mode 100644 index 0000000..e114fc4 --- /dev/null +++ b/rust/qemu-api/src/error.rs @@ -0,0 +1,416 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +//! Error propagation for QEMU Rust code +//! +//! This module contains [`Error`], the bridge between Rust errors and +//! [`Result`](std::result::Result)s and QEMU's C [`Error`](bindings::Error) +//! struct. +//! +//! For FFI code, [`Error`] provides functions to simplify conversion between +//! the Rust ([`Result<>`](std::result::Result)) and C (`Error**`) conventions: +//! +//! * [`ok_or_propagate`](crate::Error::ok_or_propagate), +//! [`bool_or_propagate`](crate::Error::bool_or_propagate), +//! [`ptr_or_propagate`](crate::Error::ptr_or_propagate) can be used to build +//! a C return value while also propagating an error condition +//! +//! * [`err_or_else`](crate::Error::err_or_else) and +//! [`err_or_unit`](crate::Error::err_or_unit) can be used to build a `Result` +//! +//! This module is most commonly used at the boundary between C and Rust code; +//! other code will usually access it through the +//! [`qemu_api::Result`](crate::Result) type alias, and will use the +//! [`std::error::Error`] interface to let C errors participate in Rust's error +//! handling functionality. +//! +//! Rust code can also create use this module to create an error object that +//! will be passed up to C code, though in most cases this will be done +//! transparently through the `?` operator. Errors can be constructed from a +//! simple error string, from an [`anyhow::Error`] to pass any other Rust error +//! type up to C code, or from a combination of the two. +//! +//! The third case, corresponding to [`Error::with_error`], is the only one that +//! requires mentioning [`qemu_api::Error`](crate::Error) explicitly. Similar +//! to how QEMU's C code handles errno values, the string and the +//! `anyhow::Error` object will be concatenated with `:` as the separator. + +use std::{ + borrow::Cow, + ffi::{c_char, c_int, c_void, CStr}, + fmt::{self, Display}, + panic, ptr, +}; + +use foreign::{prelude::*, OwnedPointer}; + +use crate::bindings; + +pub type Result<T> = std::result::Result<T, Error>; + +#[derive(Debug)] +pub struct Error { + msg: Option<Cow<'static, str>>, + /// Appends the print string of the error to the msg if not None + cause: Option<anyhow::Error>, + file: &'static str, + line: u32, +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + self.cause.as_ref().map(AsRef::as_ref) + } + + #[allow(deprecated)] + fn description(&self) -> &str { + self.msg + .as_deref() + .or_else(|| self.cause.as_deref().map(std::error::Error::description)) + .expect("no message nor cause?") + } +} + +impl Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut prefix = ""; + if let Some(ref msg) = self.msg { + write!(f, "{msg}")?; + prefix = ": "; + } + if let Some(ref cause) = self.cause { + write!(f, "{prefix}{cause}")?; + } else if prefix.is_empty() { + panic!("no message nor cause?"); + } + Ok(()) + } +} + +impl From<String> for Error { + #[track_caller] + fn from(msg: String) -> Self { + let location = panic::Location::caller(); + Error { + msg: Some(Cow::Owned(msg)), + cause: None, + file: location.file(), + line: location.line(), + } + } +} + +impl From<&'static str> for Error { + #[track_caller] + fn from(msg: &'static str) -> Self { + let location = panic::Location::caller(); + Error { + msg: Some(Cow::Borrowed(msg)), + cause: None, + file: location.file(), + line: location.line(), + } + } +} + +impl From<anyhow::Error> for Error { + #[track_caller] + fn from(error: anyhow::Error) -> Self { + let location = panic::Location::caller(); + Error { + msg: None, + cause: Some(error), + file: location.file(), + line: location.line(), + } + } +} + +impl Error { + /// Create a new error, prepending `msg` to the + /// description of `cause` + #[track_caller] + pub fn with_error(msg: impl Into<Cow<'static, str>>, cause: impl Into<anyhow::Error>) -> Self { + let location = panic::Location::caller(); + Error { + msg: Some(msg.into()), + cause: Some(cause.into()), + file: location.file(), + line: location.line(), + } + } + + /// Consume a result, returning `false` if it is an error and + /// `true` if it is successful. The error is propagated into + /// `errp` like the C API `error_propagate` would do. + /// + /// # Safety + /// + /// `errp` must be a valid argument to `error_propagate`; + /// typically it is received from C code and need not be + /// checked further at the Rust↔C boundary. + pub unsafe fn bool_or_propagate(result: Result<()>, errp: *mut *mut bindings::Error) -> bool { + // SAFETY: caller guarantees errp is valid + unsafe { Self::ok_or_propagate(result, errp) }.is_some() + } + + /// Consume a result, returning a `NULL` pointer if it is an error and + /// a C representation of the contents if it is successful. This is + /// similar to the C API `error_propagate`, but it panics if `*errp` + /// is not `NULL`. + /// + /// # Safety + /// + /// `errp` must be a valid argument to `error_propagate`; + /// typically it is received from C code and need not be + /// checked further at the Rust↔C boundary. + /// + /// See [`propagate`](Error::propagate) for more information. + #[must_use] + pub unsafe fn ptr_or_propagate<T: CloneToForeign>( + result: Result<T>, + errp: *mut *mut bindings::Error, + ) -> *mut T::Foreign { + // SAFETY: caller guarantees errp is valid + unsafe { Self::ok_or_propagate(result, errp) }.clone_to_foreign_ptr() + } + + /// Consume a result in the same way as `self.ok()`, but also propagate + /// a possible error into `errp`. This is similar to the C API + /// `error_propagate`, but it panics if `*errp` is not `NULL`. + /// + /// # Safety + /// + /// `errp` must be a valid argument to `error_propagate`; + /// typically it is received from C code and need not be + /// checked further at the Rust↔C boundary. + /// + /// See [`propagate`](Error::propagate) for more information. + pub unsafe fn ok_or_propagate<T>( + result: Result<T>, + errp: *mut *mut bindings::Error, + ) -> Option<T> { + result.map_err(|err| unsafe { err.propagate(errp) }).ok() + } + + /// Equivalent of the C function `error_propagate`. Fill `*errp` + /// with the information container in `self` if `errp` is not NULL; + /// then consume it. + /// + /// This is similar to the C API `error_propagate`, but it panics if + /// `*errp` is not `NULL`. + /// + /// # Safety + /// + /// `errp` must be a valid argument to `error_propagate`; it can be + /// `NULL` or it can point to any of: + /// * `error_abort` + /// * `error_fatal` + /// * a local variable of (C) type `Error *` + /// + /// Typically `errp` is received from C code and need not be + /// checked further at the Rust↔C boundary. + pub unsafe fn propagate(self, errp: *mut *mut bindings::Error) { + if errp.is_null() { + return; + } + + // SAFETY: caller guarantees that errp and *errp are valid + unsafe { + assert_eq!(*errp, ptr::null_mut()); + bindings::error_propagate(errp, self.clone_to_foreign_ptr()); + } + } + + /// Convert a C `Error*` into a Rust `Result`, using + /// `Ok(())` if `c_error` is NULL. Free the `Error*`. + /// + /// # Safety + /// + /// `c_error` must be `NULL` or valid; typically it was initialized + /// with `ptr::null_mut()` and passed by reference to a C function. + pub unsafe fn err_or_unit(c_error: *mut bindings::Error) -> Result<()> { + // SAFETY: caller guarantees c_error is valid + unsafe { Self::err_or_else(c_error, || ()) } + } + + /// Convert a C `Error*` into a Rust `Result`, calling `f()` to + /// obtain an `Ok` value if `c_error` is NULL. Free the `Error*`. + /// + /// # Safety + /// + /// `c_error` must be `NULL` or point to a valid C [`struct + /// Error`](bindings::Error); typically it was initialized with + /// `ptr::null_mut()` and passed by reference to a C function. + pub unsafe fn err_or_else<T, F: FnOnce() -> T>( + c_error: *mut bindings::Error, + f: F, + ) -> Result<T> { + // SAFETY: caller guarantees c_error is valid + let err = unsafe { Option::<Self>::from_foreign(c_error) }; + match err { + None => Ok(f()), + Some(err) => Err(err), + } + } +} + +impl FreeForeign for Error { + type Foreign = bindings::Error; + + unsafe fn free_foreign(p: *mut bindings::Error) { + // SAFETY: caller guarantees p is valid + unsafe { + bindings::error_free(p); + } + } +} + +impl CloneToForeign for Error { + fn clone_to_foreign(&self) -> OwnedPointer<Self> { + // SAFETY: all arguments are controlled by this function + unsafe { + let err: *mut c_void = libc::malloc(std::mem::size_of::<bindings::Error>()); + let err: &mut bindings::Error = &mut *err.cast(); + *err = bindings::Error { + msg: format!("{self}").clone_to_foreign_ptr(), + err_class: bindings::ERROR_CLASS_GENERIC_ERROR, + src_len: self.file.len() as c_int, + src: self.file.as_ptr().cast::<c_char>(), + line: self.line as c_int, + func: ptr::null_mut(), + hint: ptr::null_mut(), + }; + OwnedPointer::new(err) + } + } +} + +impl FromForeign for Error { + unsafe fn cloned_from_foreign(c_error: *const bindings::Error) -> Self { + // SAFETY: caller guarantees c_error is valid + unsafe { + let error = &*c_error; + let file = if error.src_len < 0 { + // NUL-terminated + CStr::from_ptr(error.src).to_str() + } else { + // Can become str::from_utf8 with Rust 1.87.0 + std::str::from_utf8(std::slice::from_raw_parts( + &*error.src.cast::<u8>(), + error.src_len as usize, + )) + }; + + Error { + msg: FromForeign::cloned_from_foreign(error.msg), + cause: None, + file: file.unwrap(), + line: error.line as u32, + } + } + } +} + +#[cfg(test)] +mod tests { + use std::ffi::CStr; + + use anyhow::anyhow; + use foreign::OwnedPointer; + + use super::*; + use crate::{assert_match, bindings}; + + #[track_caller] + fn error_for_test(msg: &CStr) -> OwnedPointer<Error> { + // SAFETY: all arguments are controlled by this function + let location = panic::Location::caller(); + unsafe { + let err: *mut c_void = libc::malloc(std::mem::size_of::<bindings::Error>()); + let err: &mut bindings::Error = &mut *err.cast(); + *err = bindings::Error { + msg: msg.clone_to_foreign_ptr(), + err_class: bindings::ERROR_CLASS_GENERIC_ERROR, + src_len: location.file().len() as c_int, + src: location.file().as_ptr().cast::<c_char>(), + line: location.line() as c_int, + func: ptr::null_mut(), + hint: ptr::null_mut(), + }; + OwnedPointer::new(err) + } + } + + unsafe fn error_get_pretty<'a>(local_err: *mut bindings::Error) -> &'a CStr { + unsafe { CStr::from_ptr(bindings::error_get_pretty(local_err)) } + } + + #[test] + #[allow(deprecated)] + fn test_description() { + use std::error::Error; + + assert_eq!(super::Error::from("msg").description(), "msg"); + assert_eq!(super::Error::from("msg".to_owned()).description(), "msg"); + } + + #[test] + fn test_display() { + assert_eq!(&*format!("{}", Error::from("msg")), "msg"); + assert_eq!(&*format!("{}", Error::from("msg".to_owned())), "msg"); + assert_eq!(&*format!("{}", Error::from(anyhow!("msg"))), "msg"); + + assert_eq!( + &*format!("{}", Error::with_error("msg", anyhow!("cause"))), + "msg: cause" + ); + } + + #[test] + fn test_bool_or_propagate() { + unsafe { + let mut local_err: *mut bindings::Error = ptr::null_mut(); + + assert!(Error::bool_or_propagate(Ok(()), &mut local_err)); + assert_eq!(local_err, ptr::null_mut()); + + let my_err = Error::from("msg"); + assert!(!Error::bool_or_propagate(Err(my_err), &mut local_err)); + assert_ne!(local_err, ptr::null_mut()); + assert_eq!(error_get_pretty(local_err), c"msg"); + bindings::error_free(local_err); + } + } + + #[test] + fn test_ptr_or_propagate() { + unsafe { + let mut local_err: *mut bindings::Error = ptr::null_mut(); + + let ret = Error::ptr_or_propagate(Ok("abc".to_owned()), &mut local_err); + assert_eq!(String::from_foreign(ret), "abc"); + assert_eq!(local_err, ptr::null_mut()); + + let my_err = Error::from("msg"); + assert_eq!( + Error::ptr_or_propagate(Err::<String, _>(my_err), &mut local_err), + ptr::null_mut() + ); + assert_ne!(local_err, ptr::null_mut()); + assert_eq!(error_get_pretty(local_err), c"msg"); + bindings::error_free(local_err); + } + } + + #[test] + fn test_err_or_unit() { + unsafe { + let result = Error::err_or_unit(ptr::null_mut()); + assert_match!(result, Ok(())); + + let err = error_for_test(c"msg"); + let err = Error::err_or_unit(err.into_inner()).unwrap_err(); + assert_eq!(&*format!("{err}"), "msg"); + } + } +} diff --git a/rust/qemu-api/src/lib.rs b/rust/qemu-api/src/lib.rs index 234a94e..93902fc 100644 --- a/rust/qemu-api/src/lib.rs +++ b/rust/qemu-api/src/lib.rs @@ -19,6 +19,7 @@ pub mod callbacks; pub mod cell; pub mod chardev; pub mod errno; +pub mod error; pub mod irq; pub mod memory; pub mod module; @@ -34,6 +35,8 @@ use std::{ ffi::c_void, }; +pub use error::{Error, Result}; + #[cfg(HAVE_GLIB_WITH_ALIGNED_ALLOC)] extern "C" { fn g_aligned_alloc0( diff --git a/rust/qemu-api/src/qdev.rs b/rust/qemu-api/src/qdev.rs index 1279d7a..0610959 100644 --- a/rust/qemu-api/src/qdev.rs +++ b/rust/qemu-api/src/qdev.rs @@ -12,10 +12,11 @@ use std::{ pub use bindings::{ClockEvent, DeviceClass, Property, ResetType}; use crate::{ - bindings::{self, qdev_init_gpio_in, qdev_init_gpio_out, Error, ResettableClass}, + bindings::{self, qdev_init_gpio_in, qdev_init_gpio_out, ResettableClass}, callbacks::FnCall, cell::{bql_locked, Opaque}, chardev::Chardev, + error::{Error, Result}, irq::InterruptSource, prelude::*, qom::{ObjectClass, ObjectImpl, Owned}, @@ -108,7 +109,7 @@ pub trait DeviceImpl: ObjectImpl + ResettablePhasesImpl + IsA<DeviceState> { /// /// If not `None`, the parent class's `realize` method is overridden /// with the function pointed to by `REALIZE`. - const REALIZE: Option<fn(&Self)> = None; + const REALIZE: Option<fn(&Self) -> Result<()>> = None; /// An array providing the properties that the user can set on the /// device. Not a `const` because referencing statics in constants @@ -134,10 +135,13 @@ pub trait DeviceImpl: ObjectImpl + ResettablePhasesImpl + IsA<DeviceState> { /// readable/writeable from one thread at any time. unsafe extern "C" fn rust_realize_fn<T: DeviceImpl>( dev: *mut bindings::DeviceState, - _errp: *mut *mut Error, + errp: *mut *mut bindings::Error, ) { let state = NonNull::new(dev).unwrap().cast::<T>(); - T::REALIZE.unwrap()(unsafe { state.as_ref() }); + let result = T::REALIZE.unwrap()(unsafe { state.as_ref() }); + unsafe { + Error::ok_or_propagate(result, errp); + } } unsafe impl InterfaceType for ResettableClass { diff --git a/rust/wrapper.h b/rust/wrapper.h index beddd9a..6060d3b 100644 --- a/rust/wrapper.h +++ b/rust/wrapper.h @@ -60,6 +60,7 @@ typedef enum memory_order { #include "hw/qdev-properties-system.h" #include "hw/irq.h" #include "qapi/error.h" +#include "qapi/error-internal.h" #include "migration/vmstate.h" #include "chardev/char-serial.h" #include "exec/memattrs.h" diff --git a/scripts/archive-source.sh b/scripts/archive-source.sh index e461c15..035828c 100755 --- a/scripts/archive-source.sh +++ b/scripts/archive-source.sh @@ -27,8 +27,9 @@ sub_file="${sub_tdir}/submodule.tar" # in their checkout, because the build environment is completely # different to the host OS. subprojects="keycodemapdb libvfio-user berkeley-softfloat-3 - berkeley-testfloat-3 arbitrary-int-1-rs bilge-0.2-rs - bilge-impl-0.2-rs either-1-rs itertools-0.11-rs libc-0.2-rs proc-macro2-1-rs + berkeley-testfloat-3 anyhow-1-rs arbitrary-int-1-rs bilge-0.2-rs + bilge-impl-0.2-rs either-1-rs foreign-0.3-rs itertools-0.11-rs + libc-0.2-rs proc-macro2-1-rs proc-macro-error-1-rs proc-macro-error-attr-1-rs quote-1-rs syn-2-rs unicode-ident-1-rs" sub_deinit="" diff --git a/scripts/make-release b/scripts/make-release index 8c3594a..4509a9f 100755 --- a/scripts/make-release +++ b/scripts/make-release @@ -40,8 +40,9 @@ fi # Only include wraps that are invoked with subproject() SUBPROJECTS="libvfio-user keycodemapdb berkeley-softfloat-3 - berkeley-testfloat-3 arbitrary-int-1-rs bilge-0.2-rs - bilge-impl-0.2-rs either-1-rs itertools-0.11-rs libc-0.2-rs proc-macro2-1-rs + berkeley-testfloat-3 anyhow-1-rs arbitrary-int-1-rs bilge-0.2-rs + bilge-impl-0.2-rs either-1-rs foreign-0.3-rs itertools-0.11-rs + libc-0.2-rs proc-macro2-1-rs proc-macro-error-1-rs proc-macro-error-attr-1-rs quote-1-rs syn-2-rs unicode-ident-1-rs" diff --git a/scripts/qapi/.flake8 b/scripts/qapi/.flake8 deleted file mode 100644 index a873ff6..0000000 --- a/scripts/qapi/.flake8 +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -# Prefer pylint's bare-except checks to flake8's -extend-ignore = E722 diff --git a/scripts/qapi/.isort.cfg b/scripts/qapi/.isort.cfg deleted file mode 100644 index 643caa1..0000000 --- a/scripts/qapi/.isort.cfg +++ /dev/null @@ -1,7 +0,0 @@ -[settings] -force_grid_wrap=4 -force_sort_within_sections=True -include_trailing_comma=True -line_length=72 -lines_after_imports=2 -multi_line_output=3 diff --git a/scripts/qapi/backend.py b/scripts/qapi/backend.py index 14e60aa6..49ae6ec 100644 --- a/scripts/qapi/backend.py +++ b/scripts/qapi/backend.py @@ -13,6 +13,7 @@ from .visit import gen_visit class QAPIBackend(ABC): + # pylint: disable=too-few-public-methods @abstractmethod def generate(self, @@ -36,6 +37,7 @@ class QAPIBackend(ABC): class QAPICBackend(QAPIBackend): + # pylint: disable=too-few-public-methods def generate(self, schema: QAPISchema, diff --git a/scripts/qapi/mypy.ini b/scripts/qapi/mypy.ini deleted file mode 100644 index c9dbcec..0000000 --- a/scripts/qapi/mypy.ini +++ /dev/null @@ -1,4 +0,0 @@ -[mypy] -strict = True -disallow_untyped_calls = False -python_version = 3.9 diff --git a/scripts/qapi/pylintrc b/scripts/qapi/pylintrc index d24eece..e16283a 100644 --- a/scripts/qapi/pylintrc +++ b/scripts/qapi/pylintrc @@ -19,6 +19,7 @@ disable=consider-using-f-string, too-many-instance-attributes, too-many-positional-arguments, too-many-statements, + unknown-option-value, useless-option-value, [REPORTS] diff --git a/subprojects/.gitignore b/subprojects/.gitignore index d12d346..f428193 100644 --- a/subprojects/.gitignore +++ b/subprojects/.gitignore @@ -6,10 +6,12 @@ /keycodemapdb /libvfio-user /slirp +/anyhow-1.0.98 /arbitrary-int-1.2.7 /bilge-0.2.0 /bilge-impl-0.2.0 /either-1.12.0 +/foreign-0.3.1 /itertools-0.11.0 /libc-0.2.162 /proc-macro-error-1.0.4 diff --git a/subprojects/anyhow-1-rs.wrap b/subprojects/anyhow-1-rs.wrap new file mode 100644 index 0000000..a69a364 --- /dev/null +++ b/subprojects/anyhow-1-rs.wrap @@ -0,0 +1,7 @@ +[wrap-file] +directory = anyhow-1.0.98 +source_url = https://crates.io/api/v1/crates/anyhow/1.0.98/download +source_filename = anyhow-1.0.98.tar.gz +source_hash = e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487 +#method = cargo +patch_directory = anyhow-1-rs diff --git a/subprojects/foreign-0.3-rs.wrap b/subprojects/foreign-0.3-rs.wrap new file mode 100644 index 0000000..0d218ec --- /dev/null +++ b/subprojects/foreign-0.3-rs.wrap @@ -0,0 +1,7 @@ +[wrap-file] +directory = foreign-0.3.1 +source_url = https://crates.io/api/v1/crates/foreign/0.3.1/download +source_filename = foreign-0.3.1.tar.gz +source_hash = 17ca1b5be8c9d320daf386f1809c7acc0cb09accbae795c2001953fa50585846 +#method = cargo +patch_directory = foreign-0.3-rs diff --git a/subprojects/packagefiles/anyhow-1-rs/meson.build b/subprojects/packagefiles/anyhow-1-rs/meson.build new file mode 100644 index 0000000..348bab9 --- /dev/null +++ b/subprojects/packagefiles/anyhow-1-rs/meson.build @@ -0,0 +1,33 @@ +project('anyhow-1-rs', 'rust', + meson_version: '>=1.5.0', + version: '1.0.98', + license: 'MIT OR Apache-2.0', + default_options: []) + +rustc = meson.get_compiler('rust') + +rust_args = ['--cap-lints', 'allow'] +rust_args += ['--cfg', 'feature="std"'] +if rustc.version().version_compare('<1.65.0') + error('rustc version ' + rustc.version() + ' is unsupported. Please upgrade to at least 1.65.0') +endif +rust_args += [ '--cfg', 'std_backtrace' ] # >= 1.65.0 +if rustc.version().version_compare('<1.81.0') + rust_args += [ '--cfg', 'anyhow_no_core_error' ] +endif + +_anyhow_rs = static_library( + 'anyhow', + files('src/lib.rs'), + gnu_symbol_visibility: 'hidden', + override_options: ['rust_std=2018', 'build.rust_std=2018'], + rust_abi: 'rust', + rust_args: rust_args, + dependencies: [], +) + +anyhow_dep = declare_dependency( + link_with: _anyhow_rs, +) + +meson.override_dependency('anyhow-1-rs', anyhow_dep) diff --git a/subprojects/packagefiles/foreign-0.3-rs/meson.build b/subprojects/packagefiles/foreign-0.3-rs/meson.build new file mode 100644 index 0000000..0901c02 --- /dev/null +++ b/subprojects/packagefiles/foreign-0.3-rs/meson.build @@ -0,0 +1,26 @@ +project('foreign-0.3-rs', 'rust', + meson_version: '>=1.5.0', + version: '0.2.0', + license: 'MIT OR Apache-2.0', + default_options: []) + +subproject('libc-0.2-rs', required: true) +libc_rs = dependency('libc-0.2-rs') + +_foreign_rs = static_library( + 'foreign', + files('src/lib.rs'), + gnu_symbol_visibility: 'hidden', + override_options: ['rust_std=2021', 'build.rust_std=2021'], + rust_abi: 'rust', + rust_args: [ + '--cap-lints', 'allow', + ], + dependencies: [libc_rs], +) + +foreign_dep = declare_dependency( + link_with: _foreign_rs, +) + +meson.override_dependency('foreign-0.3-rs', foreign_dep) diff --git a/system/memory.c b/system/memory.c index 63b983e..306e9ff 100644 --- a/system/memory.c +++ b/system/memory.c @@ -2174,18 +2174,14 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, } /* Called with rcu_read_lock held. */ -bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - bool *mr_has_discard_manager, Error **errp) +MemoryRegion *memory_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp) { MemoryRegion *mr; hwaddr xlat; hwaddr len = iotlb->addr_mask + 1; bool writable = iotlb->perm & IOMMU_WO; - if (mr_has_discard_manager) { - *mr_has_discard_manager = false; - } /* * The IOMMU TLB entry we have just covers translation through * this IOMMU to its immediate target. We need to translate @@ -2195,7 +2191,7 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, &xlat, &len, writable, MEMTXATTRS_UNSPECIFIED); if (!memory_region_is_ram(mr)) { error_setg(errp, "iommu map to non memory area %" HWADDR_PRIx "", xlat); - return false; + return NULL; } else if (memory_region_has_ram_discard_manager(mr)) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr); MemoryRegionSection tmp = { @@ -2203,9 +2199,6 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, .offset_within_region = xlat, .size = int128_make64(len), }; - if (mr_has_discard_manager) { - *mr_has_discard_manager = true; - } /* * Malicious VMs can map memory into the IOMMU, which is expected * to remain discarded. vfio will pin all pages, populating memory. @@ -2216,7 +2209,7 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, error_setg(errp, "iommu map to discarded memory (e.g., unplugged" " via virtio-mem): %" HWADDR_PRIx "", iotlb->translated_addr); - return false; + return NULL; } } @@ -2226,22 +2219,11 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, */ if (len & iotlb->addr_mask) { error_setg(errp, "iommu has granularity incompatible with target AS"); - return false; - } - - if (vaddr) { - *vaddr = memory_region_get_ram_ptr(mr) + xlat; - } - - if (ram_addr) { - *ram_addr = memory_region_get_ram_addr(mr) + xlat; - } - - if (read_only) { - *read_only = !writable || mr->readonly; + return NULL; } - return true; + *xlat_p = xlat; + return mr; } void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index a6bc089..56a6b9b 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -6018,9 +6018,11 @@ static bool host_supports_vmx(void) * because private/shared page tracking is already provided through other * means, these 2 use-cases should be treated as being mutually-exclusive. */ -static int kvm_handle_hc_map_gpa_range(struct kvm_run *run) +static int kvm_handle_hc_map_gpa_range(X86CPU *cpu, struct kvm_run *run) { + struct kvm_pre_fault_memory mem; uint64_t gpa, size, attributes; + int ret; if (!machine_require_guest_memfd(current_machine)) return -EINVAL; @@ -6031,13 +6033,32 @@ static int kvm_handle_hc_map_gpa_range(struct kvm_run *run) trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags); - return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED); + ret = kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED); + if (ret || !kvm_pre_fault_memory_supported) { + return ret; + } + + /* + * Opportunistically pre-fault memory in. Failures are ignored so that any + * errors in faulting in the memory will get captured in KVM page fault + * path when the guest first accesses the page. + */ + memset(&mem, 0, sizeof(mem)); + mem.gpa = gpa; + mem.size = size; + while (mem.size) { + if (kvm_vcpu_ioctl(CPU(cpu), KVM_PRE_FAULT_MEMORY, &mem)) { + break; + } + } + + return 0; } -static int kvm_handle_hypercall(struct kvm_run *run) +static int kvm_handle_hypercall(X86CPU *cpu, struct kvm_run *run) { if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE) - return kvm_handle_hc_map_gpa_range(run); + return kvm_handle_hc_map_gpa_range(cpu, run); return -EINVAL; } @@ -6137,7 +6158,7 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) break; #endif case KVM_EXIT_HYPERCALL: - ret = kvm_handle_hypercall(run); + ret = kvm_handle_hypercall(cpu, run); break; case KVM_EXIT_SYSTEM_EVENT: switch (run->system_event.type) { diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index f0aa189..a2e4d48 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -842,7 +842,7 @@ int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s) void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { - uint8_t old_flags = get_float_exception_flags(&env->sse_status); + int old_flags = get_float_exception_flags(&env->sse_status); int i; for (i = 0; i < 2 << SHIFT; i++) { d->ZMM_S(i) = float32_div(float32_one, @@ -855,7 +855,7 @@ void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) #if SHIFT == 1 void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s) { - uint8_t old_flags = get_float_exception_flags(&env->sse_status); + int old_flags = get_float_exception_flags(&env->sse_status); int i; d->ZMM_S(0) = float32_div(float32_one, float32_sqrt(s->ZMM_S(0), &env->sse_status), @@ -869,7 +869,7 @@ void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s) void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { - uint8_t old_flags = get_float_exception_flags(&env->sse_status); + int old_flags = get_float_exception_flags(&env->sse_status); int i; for (i = 0; i < 2 << SHIFT; i++) { d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status); @@ -880,7 +880,7 @@ void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) #if SHIFT == 1 void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s) { - uint8_t old_flags = get_float_exception_flags(&env->sse_status); + int old_flags = get_float_exception_flags(&env->sse_status); int i; d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status); for (i = 1; i < 2 << SHIFT; i++) { @@ -1714,7 +1714,7 @@ void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mode) { - uint8_t old_flags = get_float_exception_flags(&env->sse_status); + int old_flags = get_float_exception_flags(&env->sse_status); signed char prev_rounding_mode; int i; @@ -1738,7 +1738,7 @@ void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mode) { - uint8_t old_flags = get_float_exception_flags(&env->sse_status); + int old_flags = get_float_exception_flags(&env->sse_status); signed char prev_rounding_mode; int i; @@ -1763,7 +1763,7 @@ void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint32_t mode) { - uint8_t old_flags = get_float_exception_flags(&env->sse_status); + int old_flags = get_float_exception_flags(&env->sse_status); signed char prev_rounding_mode; int i; @@ -1788,7 +1788,7 @@ void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint32_t mode) { - uint8_t old_flags = get_float_exception_flags(&env->sse_status); + int old_flags = get_float_exception_flags(&env->sse_status); signed char prev_rounding_mode; int i; diff --git a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c index 1cbadb1..b3b2382 100644 --- a/target/i386/tcg/fpu_helper.c +++ b/target/i386/tcg/fpu_helper.c @@ -189,25 +189,25 @@ void cpu_init_fp_statuses(CPUX86State *env) set_float_default_nan_pattern(0b11000000, &env->mmx_status); set_float_default_nan_pattern(0b11000000, &env->sse_status); /* - * TODO: x86 does flush-to-zero detection after rounding (the SDM + * x86 does flush-to-zero detection after rounding (the SDM * section 10.2.3.3 on the FTZ bit of MXCSR says that we flush * when we detect underflow, which x86 does after rounding). */ - set_float_ftz_detection(float_ftz_before_rounding, &env->fp_status); - set_float_ftz_detection(float_ftz_before_rounding, &env->mmx_status); - set_float_ftz_detection(float_ftz_before_rounding, &env->sse_status); + set_float_ftz_detection(float_ftz_after_rounding, &env->fp_status); + set_float_ftz_detection(float_ftz_after_rounding, &env->mmx_status); + set_float_ftz_detection(float_ftz_after_rounding, &env->sse_status); } -static inline uint8_t save_exception_flags(CPUX86State *env) +static inline int save_exception_flags(CPUX86State *env) { - uint8_t old_flags = get_float_exception_flags(&env->fp_status); + int old_flags = get_float_exception_flags(&env->fp_status); set_float_exception_flags(0, &env->fp_status); return old_flags; } -static void merge_exception_flags(CPUX86State *env, uint8_t old_flags) +static void merge_exception_flags(CPUX86State *env, int old_flags) { - uint8_t new_flags = get_float_exception_flags(&env->fp_status); + int new_flags = get_float_exception_flags(&env->fp_status); float_raise(old_flags, &env->fp_status); fpu_set_exception(env, ((new_flags & float_flag_invalid ? FPUS_IE : 0) | @@ -215,12 +215,12 @@ static void merge_exception_flags(CPUX86State *env, uint8_t old_flags) (new_flags & float_flag_overflow ? FPUS_OE : 0) | (new_flags & float_flag_underflow ? FPUS_UE : 0) | (new_flags & float_flag_inexact ? FPUS_PE : 0) | - (new_flags & float_flag_input_denormal_flushed ? FPUS_DE : 0))); + (new_flags & float_flag_input_denormal_used ? FPUS_DE : 0))); } static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); floatx80 ret = floatx80_div(a, b, &env->fp_status); merge_exception_flags(env, old_flags); return ret; @@ -240,7 +240,7 @@ static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr) void helper_flds_FT0(CPUX86State *env, uint32_t val) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); union { float32 f; uint32_t i; @@ -253,7 +253,7 @@ void helper_flds_FT0(CPUX86State *env, uint32_t val) void helper_fldl_FT0(CPUX86State *env, uint64_t val) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); union { float64 f; uint64_t i; @@ -271,7 +271,7 @@ void helper_fildl_FT0(CPUX86State *env, int32_t val) void helper_flds_ST0(CPUX86State *env, uint32_t val) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int new_fpstt; union { float32 f; @@ -288,7 +288,7 @@ void helper_flds_ST0(CPUX86State *env, uint32_t val) void helper_fldl_ST0(CPUX86State *env, uint64_t val) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int new_fpstt; union { float64 f; @@ -338,7 +338,7 @@ void helper_fildll_ST0(CPUX86State *env, int64_t val) uint32_t helper_fsts_ST0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); union { float32 f; uint32_t i; @@ -351,7 +351,7 @@ uint32_t helper_fsts_ST0(CPUX86State *env) uint64_t helper_fstl_ST0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); union { float64 f; uint64_t i; @@ -364,7 +364,7 @@ uint64_t helper_fstl_ST0(CPUX86State *env) int32_t helper_fist_ST0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int32_t val; val = floatx80_to_int32(ST0, &env->fp_status); @@ -378,7 +378,7 @@ int32_t helper_fist_ST0(CPUX86State *env) int32_t helper_fistl_ST0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int32_t val; val = floatx80_to_int32(ST0, &env->fp_status); @@ -391,7 +391,7 @@ int32_t helper_fistl_ST0(CPUX86State *env) int64_t helper_fistll_ST0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int64_t val; val = floatx80_to_int64(ST0, &env->fp_status); @@ -404,7 +404,7 @@ int64_t helper_fistll_ST0(CPUX86State *env) int32_t helper_fistt_ST0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int32_t val; val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status); @@ -418,7 +418,7 @@ int32_t helper_fistt_ST0(CPUX86State *env) int32_t helper_fisttl_ST0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int32_t val; val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status); @@ -431,7 +431,7 @@ int32_t helper_fisttl_ST0(CPUX86State *env) int64_t helper_fisttll_ST0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int64_t val; val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status); @@ -527,7 +527,7 @@ static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500}; void helper_fcom_ST0_FT0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); FloatRelation ret; ret = floatx80_compare(ST0, FT0, &env->fp_status); @@ -537,7 +537,7 @@ void helper_fcom_ST0_FT0(CPUX86State *env) void helper_fucom_ST0_FT0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); FloatRelation ret; ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status); @@ -549,7 +549,7 @@ static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C}; void helper_fcomi_ST0_FT0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int eflags; FloatRelation ret; @@ -562,7 +562,7 @@ void helper_fcomi_ST0_FT0(CPUX86State *env) void helper_fucomi_ST0_FT0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int eflags; FloatRelation ret; @@ -575,28 +575,28 @@ void helper_fucomi_ST0_FT0(CPUX86State *env) void helper_fadd_ST0_FT0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); ST0 = floatx80_add(ST0, FT0, &env->fp_status); merge_exception_flags(env, old_flags); } void helper_fmul_ST0_FT0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); ST0 = floatx80_mul(ST0, FT0, &env->fp_status); merge_exception_flags(env, old_flags); } void helper_fsub_ST0_FT0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); ST0 = floatx80_sub(ST0, FT0, &env->fp_status); merge_exception_flags(env, old_flags); } void helper_fsubr_ST0_FT0(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); ST0 = floatx80_sub(FT0, ST0, &env->fp_status); merge_exception_flags(env, old_flags); } @@ -615,28 +615,28 @@ void helper_fdivr_ST0_FT0(CPUX86State *env) void helper_fadd_STN_ST0(CPUX86State *env, int st_index) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status); merge_exception_flags(env, old_flags); } void helper_fmul_STN_ST0(CPUX86State *env, int st_index) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status); merge_exception_flags(env, old_flags); } void helper_fsub_STN_ST0(CPUX86State *env, int st_index) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status); merge_exception_flags(env, old_flags); } void helper_fsubr_STN_ST0(CPUX86State *env, int st_index) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status); merge_exception_flags(env, old_flags); } @@ -861,7 +861,7 @@ void helper_fbld_ST0(CPUX86State *env, target_ulong ptr) void helper_fbst_ST0(CPUX86State *env, target_ulong ptr) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); int v; target_ulong mem_ref, mem_end; int64_t val; @@ -1136,7 +1136,7 @@ static const struct f2xm1_data f2xm1_table[65] = { void helper_f2xm1(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); uint64_t sig = extractFloatx80Frac(ST0); int32_t exp = extractFloatx80Exp(ST0); bool sign = extractFloatx80Sign(ST0); @@ -1369,7 +1369,7 @@ static const struct fpatan_data fpatan_table[9] = { void helper_fpatan(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); uint64_t arg0_sig = extractFloatx80Frac(ST0); int32_t arg0_exp = extractFloatx80Exp(ST0); bool arg0_sign = extractFloatx80Sign(ST0); @@ -1808,7 +1808,7 @@ void helper_fpatan(CPUX86State *env) void helper_fxtract(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); CPU_LDoubleU temp; temp.d = ST0; @@ -1857,7 +1857,7 @@ void helper_fxtract(CPUX86State *env) static void helper_fprem_common(CPUX86State *env, bool mod) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); uint64_t quotient; CPU_LDoubleU temp0, temp1; int exp0, exp1, expdiff; @@ -2053,7 +2053,7 @@ static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp, void helper_fyl2xp1(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); uint64_t arg0_sig = extractFloatx80Frac(ST0); int32_t arg0_exp = extractFloatx80Exp(ST0); bool arg0_sign = extractFloatx80Sign(ST0); @@ -2151,7 +2151,7 @@ void helper_fyl2xp1(CPUX86State *env) void helper_fyl2x(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); uint64_t arg0_sig = extractFloatx80Frac(ST0); int32_t arg0_exp = extractFloatx80Exp(ST0); bool arg0_sign = extractFloatx80Sign(ST0); @@ -2298,7 +2298,7 @@ void helper_fyl2x(CPUX86State *env) void helper_fsqrt(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); if (floatx80_is_neg(ST0)) { env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */ env->fpus |= 0x400; @@ -2324,14 +2324,14 @@ void helper_fsincos(CPUX86State *env) void helper_frndint(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); ST0 = floatx80_round_to_int(ST0, &env->fp_status); merge_exception_flags(env, old_flags); } void helper_fscale(CPUX86State *env) { - uint8_t old_flags = save_exception_flags(env); + int old_flags = save_exception_flags(env); if (floatx80_invalid_encoding(ST1, &env->fp_status) || floatx80_invalid_encoding(ST0, &env->fp_status)) { float_raise(float_flag_invalid, &env->fp_status); @@ -2369,7 +2369,7 @@ void helper_fscale(CPUX86State *env) } else { int n; FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision; - uint8_t save_flags = get_float_exception_flags(&env->fp_status); + int save_flags = get_float_exception_flags(&env->fp_status); set_float_exception_flags(0, &env->fp_status); n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status); set_float_exception_flags(save_flags, &env->fp_status); @@ -3254,6 +3254,7 @@ void update_mxcsr_status(CPUX86State *env) /* Set exception flags. */ set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) | + (mxcsr & FPUS_DE ? float_flag_input_denormal_used : 0) | (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) | (mxcsr & FPUS_OE ? float_flag_overflow : 0) | (mxcsr & FPUS_UE ? float_flag_underflow : 0) | @@ -3269,15 +3270,9 @@ void update_mxcsr_status(CPUX86State *env) void update_mxcsr_from_sse_status(CPUX86State *env) { - uint8_t flags = get_float_exception_flags(&env->sse_status); - /* - * The MXCSR denormal flag has opposite semantics to - * float_flag_input_denormal_flushed (the softfloat code sets that flag - * only when flushing input denormals to zero, but SSE sets it - * only when not flushing them to zero), so is not converted - * here. - */ + int flags = get_float_exception_flags(&env->sse_status); env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) | + (flags & float_flag_input_denormal_used ? FPUS_DE : 0) | (flags & float_flag_divbyzero ? FPUS_ZE : 0) | (flags & float_flag_overflow ? FPUS_OE : 0) | (flags & float_flag_underflow ? FPUS_UE : 0) | diff --git a/tests/tcg/x86_64/fma.c b/tests/tcg/x86_64/fma.c index 09c622e..3421961 100644 --- a/tests/tcg/x86_64/fma.c +++ b/tests/tcg/x86_64/fma.c @@ -79,14 +79,21 @@ static testdata tests[] = { /* * Flushing of denormal outputs to zero should also happen after * rounding, so setting FTZ should not affect the result or the flags. - * QEMU currently does not emulate this correctly because we do the - * flush-to-zero check before rounding, so we incorrectly produce a - * zero result and set Underflow as well as Precision. */ -#ifdef ENABLE_FAILING_TESTS { 0x3fdfffffffffffff, 0x001fffffffffffff, 0x801fffffffffffff, true, 0x8010000000000000, 0x20 }, /* Enabling FTZ shouldn't change flags */ -#endif + /* + * normal * 0 + a denormal. With FTZ disabled this gives an exact + * result (equal to the input denormal) that has consumed the denormal. + */ + { 0x3cc8000000000000, 0x0000000000000000, 0x8008000000000000, false, + 0x8008000000000000, 0x2 }, /* Denormal */ + /* + * With FTZ enabled, this consumes the denormal, returns zero (because + * flushed) and indicates also Underflow and Precision. + */ + { 0x3cc8000000000000, 0x0000000000000000, 0x8008000000000000, true, + 0x8000000000000000, 0x32 }, /* Precision, Underflow, Denormal */ }; int main(void) diff --git a/tests/unit/test-aio-multithread.c b/tests/unit/test-aio-multithread.c index 08d4570..0ead6bf 100644 --- a/tests/unit/test-aio-multithread.c +++ b/tests/unit/test-aio-multithread.c @@ -305,7 +305,9 @@ static void mcs_mutex_lock(void) prev = qatomic_xchg(&mutex_head, id); if (prev != -1) { qatomic_set(&nodes[prev].next, id); - qemu_futex_wait(&nodes[id].locked, 1); + while (qatomic_read(&nodes[id].locked) == 1) { + qemu_futex_wait(&nodes[id].locked, 1); + } } } @@ -328,7 +330,7 @@ static void mcs_mutex_unlock(void) /* Wake up the next in line. */ next = qatomic_read(&nodes[id].next); nodes[next].locked = 0; - qemu_futex_wake(&nodes[next].locked, 1); + qemu_futex_wake_single(&nodes[next].locked); } static void test_multi_fair_mutex_entry(void *opaque) diff --git a/util/error.c b/util/error.c index 673011b..daea214 100644 --- a/util/error.c +++ b/util/error.c @@ -15,15 +15,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu/error-report.h" - -struct Error -{ - char *msg; - ErrorClass err_class; - const char *src, *func; - int line; - GString *hint; -}; +#include "qapi/error-internal.h" Error *error_abort; Error *error_fatal; @@ -32,8 +24,13 @@ Error *error_warn; static void error_handle(Error **errp, Error *err) { if (errp == &error_abort) { - fprintf(stderr, "Unexpected error in %s() at %s:%d:\n", - err->func, err->src, err->line); + if (err->func) { + fprintf(stderr, "Unexpected error in %s() at %.*s:%d:\n", + err->func, err->src_len, err->src, err->line); + } else { + fprintf(stderr, "Unexpected error at %.*s:%d:\n", + err->src_len, err->src, err->line); + } error_report("%s", error_get_pretty(err)); if (err->hint) { error_printf("%s", err->hint->str); @@ -75,6 +72,7 @@ static void error_setv(Error **errp, g_free(msg); } err->err_class = err_class; + err->src_len = -1; err->src = src; err->line = line; err->func = func; diff --git a/util/event.c b/util/event.c new file mode 100644 index 0000000..5a8141c --- /dev/null +++ b/util/event.c @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "qemu/osdep.h" +#include "qemu/thread.h" + +/* + * Valid transitions: + * - FREE -> SET (qemu_event_set) + * - BUSY -> SET (qemu_event_set) + * - SET -> FREE (qemu_event_reset) + * - FREE -> BUSY (qemu_event_wait) + * + * With futex, the waking and blocking operations follow + * BUSY -> SET and FREE -> BUSY, respectively. + * + * Without futex, BUSY -> SET and FREE -> BUSY never happen. Instead, the waking + * operation follows FREE -> SET and the blocking operation will happen in + * qemu_event_wait() if the event is not SET. + * + * SET->BUSY does not happen (it can be observed from the outside but + * it really is SET->FREE->BUSY). + * + * busy->free provably cannot happen; to enforce it, the set->free transition + * is done with an OR, which becomes a no-op if the event has concurrently + * transitioned to free or busy. + */ + +#define EV_SET 0 +#define EV_FREE 1 +#define EV_BUSY -1 + +void qemu_event_init(QemuEvent *ev, bool init) +{ +#ifndef HAVE_FUTEX + pthread_mutex_init(&ev->lock, NULL); + pthread_cond_init(&ev->cond, NULL); +#endif + + ev->value = (init ? EV_SET : EV_FREE); + ev->initialized = true; +} + +void qemu_event_destroy(QemuEvent *ev) +{ + assert(ev->initialized); + ev->initialized = false; +#ifndef HAVE_FUTEX + pthread_mutex_destroy(&ev->lock); + pthread_cond_destroy(&ev->cond); +#endif +} + +void qemu_event_set(QemuEvent *ev) +{ + assert(ev->initialized); + +#ifdef HAVE_FUTEX + /* + * Pairs with both qemu_event_reset() and qemu_event_wait(). + * + * qemu_event_set has release semantics, but because it *loads* + * ev->value we need a full memory barrier here. + */ + smp_mb(); + if (qatomic_read(&ev->value) != EV_SET) { + int old = qatomic_xchg(&ev->value, EV_SET); + + /* Pairs with memory barrier in kernel futex_wait system call. */ + smp_mb__after_rmw(); + if (old == EV_BUSY) { + /* There were waiters, wake them up. */ + qemu_futex_wake_all(ev); + } + } +#else + pthread_mutex_lock(&ev->lock); + /* Pairs with qemu_event_reset()'s load acquire. */ + qatomic_store_release(&ev->value, EV_SET); + pthread_cond_broadcast(&ev->cond); + pthread_mutex_unlock(&ev->lock); +#endif +} + +void qemu_event_reset(QemuEvent *ev) +{ + assert(ev->initialized); + +#ifdef HAVE_FUTEX + /* + * If there was a concurrent reset (or even reset+wait), + * do nothing. Otherwise change EV_SET->EV_FREE. + */ + qatomic_or(&ev->value, EV_FREE); + + /* + * Order reset before checking the condition in the caller. + * Pairs with the first memory barrier in qemu_event_set(). + */ + smp_mb__after_rmw(); +#else + /* + * If futexes are not available, there are no EV_FREE->EV_BUSY + * transitions because wakeups are done entirely through the + * condition variable. Since qatomic_set() only writes EV_FREE, + * the load seems useless but in reality, the acquire synchronizes + * with qemu_event_set()'s store release: if qemu_event_reset() + * sees EV_SET here, then the caller will certainly see a + * successful condition and skip qemu_event_wait(): + * + * done = 1; if (done == 0) + * qemu_event_set() { qemu_event_reset() { + * lock(); + * ev->value = EV_SET -----> load ev->value + * ev->value = old value | EV_FREE + * cond_broadcast() + * unlock(); } + * } if (done == 0) + * // qemu_event_wait() not called + */ + qatomic_set(&ev->value, qatomic_load_acquire(&ev->value) | EV_FREE); +#endif +} + +void qemu_event_wait(QemuEvent *ev) +{ + assert(ev->initialized); + +#ifdef HAVE_FUTEX + while (true) { + /* + * qemu_event_wait must synchronize with qemu_event_set even if it does + * not go down the slow path, so this load-acquire is needed that + * synchronizes with the first memory barrier in qemu_event_set(). + */ + unsigned value = qatomic_load_acquire(&ev->value); + if (value == EV_SET) { + break; + } + + if (value == EV_FREE) { + /* + * Leave the event reset and tell qemu_event_set that there are + * waiters. No need to retry, because there cannot be a concurrent + * busy->free transition. After the CAS, the event will be either + * set or busy. + * + * This cmpxchg doesn't have particular ordering requirements if it + * succeeds (moving the store earlier can only cause + * qemu_event_set() to issue _more_ wakeups), the failing case needs + * acquire semantics like the load above. + */ + if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) { + break; + } + } + + /* + * This is the final check for a concurrent set, so it does need + * a smp_mb() pairing with the second barrier of qemu_event_set(). + * The barrier is inside the FUTEX_WAIT system call. + */ + qemu_futex_wait(ev, EV_BUSY); + } +#else + pthread_mutex_lock(&ev->lock); + while (qatomic_read(&ev->value) != EV_SET) { + pthread_cond_wait(&ev->cond, &ev->lock); + } + pthread_mutex_unlock(&ev->lock); +#endif +} diff --git a/util/lockcnt.c b/util/lockcnt.c index d07c6cc..92c9f8c 100644 --- a/util/lockcnt.c +++ b/util/lockcnt.c @@ -12,10 +12,11 @@ #include "qemu/atomic.h" #include "trace.h" -#ifdef CONFIG_LINUX -#include "qemu/futex.h" +#ifdef HAVE_FUTEX -/* On Linux, bits 0-1 are a futex-based lock, bits 2-31 are the counter. +/* + * When futex is available, bits 0-1 are a futex-based lock, bits 2-31 are the + * counter. * For the mutex algorithm see Ulrich Drepper's "Futexes Are Tricky" (ok, * this is not the most relaxing citation I could make...). It is similar * to mutex2 in the paper. @@ -106,7 +107,7 @@ static bool qemu_lockcnt_cmpxchg_or_wait(QemuLockCnt *lockcnt, int *val, static void lockcnt_wake(QemuLockCnt *lockcnt) { trace_lockcnt_futex_wake(lockcnt); - qemu_futex_wake(&lockcnt->count, 1); + qemu_futex_wake_single(&lockcnt->count); } void qemu_lockcnt_inc(QemuLockCnt *lockcnt) diff --git a/util/meson.build b/util/meson.build index 1adff96..3502938 100644 --- a/util/meson.build +++ b/util/meson.build @@ -27,7 +27,7 @@ else util_ss.add(files('event_notifier-win32.c')) util_ss.add(files('oslib-win32.c')) util_ss.add(files('qemu-thread-win32.c')) - util_ss.add(winmm, pathcch) + util_ss.add(winmm, pathcch, synchronization) endif util_ss.add(when: linux_io_uring, if_true: files('fdmon-io_uring.c')) if glib_has_gslice @@ -35,6 +35,7 @@ if glib_has_gslice endif util_ss.add(files('defer-call.c')) util_ss.add(files('envlist.c', 'path.c', 'module.c')) +util_ss.add(files('event.c')) util_ss.add(files('host-utils.c')) util_ss.add(files('bitmap.c', 'bitops.c')) util_ss.add(files('fifo8.c')) diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c index b2e26e2..ba72544 100644 --- a/util/qemu-thread-posix.c +++ b/util/qemu-thread-posix.c @@ -317,154 +317,6 @@ void qemu_sem_wait(QemuSemaphore *sem) qemu_mutex_unlock(&sem->mutex); } -#ifdef __linux__ -#include "qemu/futex.h" -#else -static inline void qemu_futex_wake(QemuEvent *ev, int n) -{ - assert(ev->initialized); - pthread_mutex_lock(&ev->lock); - if (n == 1) { - pthread_cond_signal(&ev->cond); - } else { - pthread_cond_broadcast(&ev->cond); - } - pthread_mutex_unlock(&ev->lock); -} - -static inline void qemu_futex_wait(QemuEvent *ev, unsigned val) -{ - assert(ev->initialized); - pthread_mutex_lock(&ev->lock); - if (ev->value == val) { - pthread_cond_wait(&ev->cond, &ev->lock); - } - pthread_mutex_unlock(&ev->lock); -} -#endif - -/* Valid transitions: - * - free->set, when setting the event - * - busy->set, when setting the event, followed by qemu_futex_wake - * - set->free, when resetting the event - * - free->busy, when waiting - * - * set->busy does not happen (it can be observed from the outside but - * it really is set->free->busy). - * - * busy->free provably cannot happen; to enforce it, the set->free transition - * is done with an OR, which becomes a no-op if the event has concurrently - * transitioned to free or busy. - */ - -#define EV_SET 0 -#define EV_FREE 1 -#define EV_BUSY -1 - -void qemu_event_init(QemuEvent *ev, bool init) -{ -#ifndef __linux__ - pthread_mutex_init(&ev->lock, NULL); - pthread_cond_init(&ev->cond, NULL); -#endif - - ev->value = (init ? EV_SET : EV_FREE); - ev->initialized = true; -} - -void qemu_event_destroy(QemuEvent *ev) -{ - assert(ev->initialized); - ev->initialized = false; -#ifndef __linux__ - pthread_mutex_destroy(&ev->lock); - pthread_cond_destroy(&ev->cond); -#endif -} - -void qemu_event_set(QemuEvent *ev) -{ - assert(ev->initialized); - - /* - * Pairs with both qemu_event_reset() and qemu_event_wait(). - * - * qemu_event_set has release semantics, but because it *loads* - * ev->value we need a full memory barrier here. - */ - smp_mb(); - if (qatomic_read(&ev->value) != EV_SET) { - int old = qatomic_xchg(&ev->value, EV_SET); - - /* Pairs with memory barrier in kernel futex_wait system call. */ - smp_mb__after_rmw(); - if (old == EV_BUSY) { - /* There were waiters, wake them up. */ - qemu_futex_wake(ev, INT_MAX); - } - } -} - -void qemu_event_reset(QemuEvent *ev) -{ - assert(ev->initialized); - - /* - * If there was a concurrent reset (or even reset+wait), - * do nothing. Otherwise change EV_SET->EV_FREE. - */ - qatomic_or(&ev->value, EV_FREE); - - /* - * Order reset before checking the condition in the caller. - * Pairs with the first memory barrier in qemu_event_set(). - */ - smp_mb__after_rmw(); -} - -void qemu_event_wait(QemuEvent *ev) -{ - unsigned value; - - assert(ev->initialized); - - /* - * qemu_event_wait must synchronize with qemu_event_set even if it does - * not go down the slow path, so this load-acquire is needed that - * synchronizes with the first memory barrier in qemu_event_set(). - * - * If we do go down the slow path, there is no requirement at all: we - * might miss a qemu_event_set() here but ultimately the memory barrier in - * qemu_futex_wait() will ensure the check is done correctly. - */ - value = qatomic_load_acquire(&ev->value); - if (value != EV_SET) { - if (value == EV_FREE) { - /* - * Leave the event reset and tell qemu_event_set that there are - * waiters. No need to retry, because there cannot be a concurrent - * busy->free transition. After the CAS, the event will be either - * set or busy. - * - * This cmpxchg doesn't have particular ordering requirements if it - * succeeds (moving the store earlier can only cause qemu_event_set() - * to issue _more_ wakeups), the failing case needs acquire semantics - * like the load above. - */ - if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) { - return; - } - } - - /* - * This is the final check for a concurrent set, so it does need - * a smp_mb() pairing with the second barrier of qemu_event_set(). - * The barrier is inside the FUTEX_WAIT system call. - */ - qemu_futex_wait(ev, EV_BUSY); - } -} - static __thread NotifierList thread_exit; /* diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c index a7fe3cc..ca2e0b5 100644 --- a/util/qemu-thread-win32.c +++ b/util/qemu-thread-win32.c @@ -231,135 +231,6 @@ void qemu_sem_wait(QemuSemaphore *sem) } } -/* Wrap a Win32 manual-reset event with a fast userspace path. The idea - * is to reset the Win32 event lazily, as part of a test-reset-test-wait - * sequence. Such a sequence is, indeed, how QemuEvents are used by - * RCU and other subsystems! - * - * Valid transitions: - * - free->set, when setting the event - * - busy->set, when setting the event, followed by SetEvent - * - set->free, when resetting the event - * - free->busy, when waiting - * - * set->busy does not happen (it can be observed from the outside but - * it really is set->free->busy). - * - * busy->free provably cannot happen; to enforce it, the set->free transition - * is done with an OR, which becomes a no-op if the event has concurrently - * transitioned to free or busy (and is faster than cmpxchg). - */ - -#define EV_SET 0 -#define EV_FREE 1 -#define EV_BUSY -1 - -void qemu_event_init(QemuEvent *ev, bool init) -{ - /* Manual reset. */ - ev->event = CreateEvent(NULL, TRUE, TRUE, NULL); - ev->value = (init ? EV_SET : EV_FREE); - ev->initialized = true; -} - -void qemu_event_destroy(QemuEvent *ev) -{ - assert(ev->initialized); - ev->initialized = false; - CloseHandle(ev->event); -} - -void qemu_event_set(QemuEvent *ev) -{ - assert(ev->initialized); - - /* - * Pairs with both qemu_event_reset() and qemu_event_wait(). - * - * qemu_event_set has release semantics, but because it *loads* - * ev->value we need a full memory barrier here. - */ - smp_mb(); - if (qatomic_read(&ev->value) != EV_SET) { - int old = qatomic_xchg(&ev->value, EV_SET); - - /* Pairs with memory barrier after ResetEvent. */ - smp_mb__after_rmw(); - if (old == EV_BUSY) { - /* There were waiters, wake them up. */ - SetEvent(ev->event); - } - } -} - -void qemu_event_reset(QemuEvent *ev) -{ - assert(ev->initialized); - - /* - * If there was a concurrent reset (or even reset+wait), - * do nothing. Otherwise change EV_SET->EV_FREE. - */ - qatomic_or(&ev->value, EV_FREE); - - /* - * Order reset before checking the condition in the caller. - * Pairs with the first memory barrier in qemu_event_set(). - */ - smp_mb__after_rmw(); -} - -void qemu_event_wait(QemuEvent *ev) -{ - unsigned value; - - assert(ev->initialized); - - /* - * qemu_event_wait must synchronize with qemu_event_set even if it does - * not go down the slow path, so this load-acquire is needed that - * synchronizes with the first memory barrier in qemu_event_set(). - * - * If we do go down the slow path, there is no requirement at all: we - * might miss a qemu_event_set() here but ultimately the memory barrier in - * qemu_futex_wait() will ensure the check is done correctly. - */ - value = qatomic_load_acquire(&ev->value); - if (value != EV_SET) { - if (value == EV_FREE) { - /* - * Here the underlying kernel event is reset, but qemu_event_set is - * not yet going to call SetEvent. However, there will be another - * check for EV_SET below when setting EV_BUSY. At that point it - * is safe to call WaitForSingleObject. - */ - ResetEvent(ev->event); - - /* - * It is not clear whether ResetEvent provides this barrier; kernel - * APIs (KeResetEvent/KeClearEvent) do not. Better safe than sorry! - */ - smp_mb(); - - /* - * Leave the event reset and tell qemu_event_set that there are - * waiters. No need to retry, because there cannot be a concurrent - * busy->free transition. After the CAS, the event will be either - * set or busy. - */ - if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) { - return; - } - } - - /* - * ev->value is now EV_BUSY. Since we didn't observe EV_SET, - * qemu_event_set() must observe EV_BUSY and call SetEvent(). - */ - WaitForSingleObject(ev->event, INFINITE); - } -} - struct QemuThreadData { /* Passed to win32_start_routine. */ void *(*start_routine)(void *); |