diff options
author | Richard Henderson <richard.henderson@linaro.org> | 2022-06-15 09:47:24 -0700 |
---|---|---|
committer | Richard Henderson <richard.henderson@linaro.org> | 2022-06-15 09:47:24 -0700 |
commit | 9ac873a46963098441be920ef7a2eaf244a3352d (patch) | |
tree | 86bb7301ad0f534513a53984d0d3c17a1740f453 | |
parent | 8e6c70b9d4a1b1f3011805947925cfdb31642f7f (diff) | |
parent | 99b969fbe105117f5af6060d3afef40ca39cc9c1 (diff) | |
download | qemu-9ac873a46963098441be920ef7a2eaf244a3352d.zip qemu-9ac873a46963098441be920ef7a2eaf244a3352d.tar.gz qemu-9ac873a46963098441be920ef7a2eaf244a3352d.tar.bz2 |
Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into staging
Pull request
This pull request includes an important aio=native I/O stall fix, the
experimental vfio-user server, the io_uring_register_ring_fd() optimization for
aio=io_uring, and an update to Vladimir Sementsov-Ogievskiy's maintainership
details.
# -----BEGIN PGP SIGNATURE-----
#
# iQEzBAABCAAdFiEEhpWov9P5fNqsNXdanKSrs4Grc8gFAmKp/+AACgkQnKSrs4Gr
# c8gg9wf/ZG1+eGR2NA0T1szlhtgy2bnp95hrLbKzP7tVxueFq7QCcsIsLGWqfnMd
# RREUi6Tgx1v7Agk2oIyUcrjn5rt4LPVOKolVbK6e5Pyou2/Sf/ApkhRjRnzzfACE
# J56H8gPU7fS4/8sJYCYGlWEr7pMmJMVJFPl2tNsErPwuZMSjo27n6UqDE/ZSZF1p
# w1a+cwo+6YSjtJg4AFB/+izBam4+U6w1YhgZM6p6hx5a7GLoq/w59W6Yb119GANO
# tg5qzmSHtMKTieORJmYAt83T1xS5d/iyca4w1PiYQxJsHsqwAaPpoyEhgGT+u+CA
# hfb3HDdQCFyVKwlKD5H1a+WD/Hr11w==
# =zcl8
# -----END PGP SIGNATURE-----
# gpg: Signature made Wed 15 Jun 2022 08:50:56 AM PDT
# gpg: using RSA key 8695A8BFD3F97CDAAC35775A9CA4ABB381AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>" [full]
# gpg: aka "Stefan Hajnoczi <stefanha@gmail.com>" [full]
* tag 'block-pull-request' of https://gitlab.com/stefanha/qemu:
linux-aio: explain why max batch is checked in laio_io_unplug()
linux-aio: fix unbalanced plugged counter in laio_io_unplug()
vfio-user: handle reset of remote device
vfio-user: handle device interrupts
vfio-user: handle PCI BAR accesses
vfio-user: handle DMA mappings
vfio-user: IOMMU support for remote device
vfio-user: handle PCI config space accesses
vfio-user: run vfio-user context
vfio-user: find and init PCI device
vfio-user: instantiate vfio-user context
vfio-user: define vfio-user-server object
vfio-user: build library
remote/machine: add vfio-user property
remote/machine: add HotplugHandler for remote machine
qdev: unplug blocker for devices
Use io_uring_register_ring_fd() to skip fd operations
MAINTAINERS: update Vladimir's address and repositories
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
37 files changed, 1564 insertions, 31 deletions
diff --git a/.gitlab-ci.d/buildtest.yml b/.gitlab-ci.d/buildtest.yml index cb7cad4..8a4353e 100644 --- a/.gitlab-ci.d/buildtest.yml +++ b/.gitlab-ci.d/buildtest.yml @@ -168,6 +168,7 @@ build-system-centos: IMAGE: centos8 CONFIGURE_ARGS: --disable-nettle --enable-gcrypt --enable-fdt=system --enable-modules --enable-trace-backends=dtrace --enable-docs + --enable-vfio-user-server TARGETS: ppc64-softmmu or1k-softmmu s390x-softmmu x86_64-softmmu rx-softmmu sh4-softmmu nios2-softmmu MAKE_CHECK_ARGS: check-build diff --git a/.gitmodules b/.gitmodules index b8bff47..aedd9a0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -64,3 +64,6 @@ [submodule "tests/lcitool/libvirt-ci"] path = tests/lcitool/libvirt-ci url = https://gitlab.com/libvirt/libvirt-ci.git +[submodule "subprojects/libvfio-user"] + path = subprojects/libvfio-user + url = https://gitlab.com/qemu-project/libvfio-user.git diff --git a/Kconfig.host b/Kconfig.host index 1165c4e..d763d89 100644 --- a/Kconfig.host +++ b/Kconfig.host @@ -42,3 +42,7 @@ config MULTIPROCESS_ALLOWED config FUZZ bool select SPARSE_MEM + +config VFIO_USER_SERVER_ALLOWED + bool + imply VFIO_USER_SERVER diff --git a/MAINTAINERS b/MAINTAINERS index 4cf6174..aaa649a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2546,7 +2546,7 @@ F: scsi/* Block Jobs M: John Snow <jsnow@redhat.com> -M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru> +M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> L: qemu-block@nongnu.org S: Supported F: blockjob.c @@ -2571,7 +2571,7 @@ F: block/aio_task.c F: util/qemu-co-shared-resource.c F: include/qemu/co-shared-resource.h T: git https://gitlab.com/jsnow/qemu.git jobs -T: git https://src.openvz.org/scm/~vsementsov/qemu.git jobs +T: git https://gitlab.com/vsementsov/qemu.git block Block QAPI, monitor, command line M: Markus Armbruster <armbru@redhat.com> @@ -2592,7 +2592,7 @@ F: include/hw/cxl/ Dirty Bitmaps M: Eric Blake <eblake@redhat.com> -M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru> +M: Vladimir 
Sementsov-Ogievskiy <vsementsov@yandex-team.ru> R: John Snow <jsnow@redhat.com> L: qemu-block@nongnu.org S: Supported @@ -2606,6 +2606,7 @@ F: util/hbitmap.c F: tests/unit/test-hbitmap.c F: docs/interop/bitmaps.rst T: git https://repo.or.cz/qemu/ericb.git bitmaps +T: git https://gitlab.com/vsementsov/qemu.git block Character device backends M: Marc-André Lureau <marcandre.lureau@redhat.com> @@ -2816,16 +2817,17 @@ F: scripts/*.py F: tests/*.py Benchmark util -M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru> +M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> S: Maintained F: scripts/simplebench/ -T: git https://src.openvz.org/scm/~vsementsov/qemu.git simplebench +T: git https://gitlab.com/vsementsov/qemu.git simplebench Transactions helper -M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru> +M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> S: Maintained F: include/qemu/transactions.h F: util/transactions.c +T: git https://gitlab.com/vsementsov/qemu.git block QAPI M: Markus Armbruster <armbru@redhat.com> @@ -3402,7 +3404,7 @@ F: block/iscsi-opts.c Network Block Device (NBD) M: Eric Blake <eblake@redhat.com> -M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru> +M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> L: qemu-block@nongnu.org S: Maintained F: block/nbd* @@ -3414,7 +3416,7 @@ F: docs/interop/nbd.txt F: docs/tools/qemu-nbd.rst F: tests/qemu-iotests/tests/*nbd* T: git https://repo.or.cz/qemu/ericb.git nbd -T: git https://src.openvz.org/scm/~vsementsov/qemu.git nbd +T: git https://gitlab.com/vsementsov/qemu.git block NFS M: Peter Lieven <pl@kamp.de> @@ -3499,13 +3501,13 @@ F: block/dmg.c parallels M: Stefan Hajnoczi <stefanha@redhat.com> M: Denis V. 
Lunev <den@openvz.org> -M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru> +M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> L: qemu-block@nongnu.org S: Supported F: block/parallels.c F: block/parallels-ext.c F: docs/interop/parallels.txt -T: git https://src.openvz.org/scm/~vsementsov/qemu.git parallels +T: git https://gitlab.com/vsementsov/qemu.git block qed M: Stefan Hajnoczi <stefanha@redhat.com> @@ -3640,6 +3642,11 @@ F: hw/remote/proxy-memory-listener.c F: include/hw/remote/proxy-memory-listener.h F: hw/remote/iohub.c F: include/hw/remote/iohub.h +F: subprojects/libvfio-user +F: hw/remote/vfio-user-obj.c +F: include/hw/remote/vfio-user-obj.h +F: hw/remote/iommu.c +F: include/hw/remote/iommu.h EBPF: M: Jason Wang <jasowang@redhat.com> diff --git a/block/io_uring.c b/block/io_uring.c index 0b40151..d48e472 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -18,6 +18,7 @@ #include "qapi/error.h" #include "trace.h" + /* io_uring ring size */ #define MAX_ENTRIES 128 @@ -434,8 +435,17 @@ LuringState *luring_init(Error **errp) } ioq_init(&s->io_q); - return s; +#ifdef CONFIG_LIBURING_REGISTER_RING_FD + if (io_uring_register_ring_fd(&s->ring) < 0) { + /* + * Only warn about this error: we will fallback to the non-optimized + * io_uring operations. + */ + warn_report("failed to register linux io_uring ring file descriptor"); + } +#endif + return s; } void luring_cleanup(LuringState *s) diff --git a/block/linux-aio.c b/block/linux-aio.c index 4c423fc..9c2393a 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -363,8 +363,16 @@ void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s, uint64_t dev_max_batch) { assert(s->io_q.plugged); + s->io_q.plugged--; + + /* + * Why max batch checking is performed here: + * Another BDS may have queued requests with a higher dev_max_batch and + * therefore in_queue could now exceed our dev_max_batch. Re-check the max + * batch so we can honor our device's dev_max_batch. 
+ */ if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) || - (--s->io_q.plugged == 0 && + (!s->io_q.plugged && !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) { ioq_submit(s); } @@ -315,6 +315,7 @@ meson_args="" ninja="" bindir="bin" skip_meson=no +vfio_user_server="disabled" # The following Meson options are handled manually (still they # are included in the automatically generated help message) @@ -909,6 +910,10 @@ for opt do ;; --disable-blobs) meson_option_parse --disable-install-blobs "" ;; + --enable-vfio-user-server) vfio_user_server="enabled" + ;; + --disable-vfio-user-server) vfio_user_server="disabled" + ;; --enable-tcmalloc) meson_option_parse --enable-malloc=tcmalloc tcmalloc ;; --enable-jemalloc) meson_option_parse --enable-malloc=jemalloc jemalloc @@ -2133,6 +2138,17 @@ write_container_target_makefile() { ########################################## +# check for vfio_user_server + +case "$vfio_user_server" in + enabled ) + if test "$git_submodules_action" != "ignore"; then + git_submodules="${git_submodules} subprojects/libvfio-user" + fi + ;; +esac + +########################################## # End of CC checks # After here, no more $cc or $ld runs @@ -2672,6 +2688,7 @@ if test "$skip_meson" = no; then test "$slirp" != auto && meson_option_add "-Dslirp=$slirp" test "$smbd" != '' && meson_option_add "-Dsmbd=$smbd" test "$tcg" != enabled && meson_option_add "-Dtcg=$tcg" + test "$vfio_user_server" != auto && meson_option_add "-Dvfio_user_server=$vfio_user_server" run_meson() { NINJA=$ninja $meson setup --prefix "$prefix" "$@" $cross_arg "$PWD" "$source_path" } diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 84f3019..0806d8f 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -468,6 +468,28 @@ char *qdev_get_dev_path(DeviceState *dev) return NULL; } +void qdev_add_unplug_blocker(DeviceState *dev, Error *reason) +{ + dev->unplug_blockers = g_slist_prepend(dev->unplug_blockers, reason); +} + +void qdev_del_unplug_blocker(DeviceState *dev, 
Error *reason) +{ + dev->unplug_blockers = g_slist_remove(dev->unplug_blockers, reason); +} + +bool qdev_unplug_blocked(DeviceState *dev, Error **errp) +{ + ERRP_GUARD(); + + if (dev->unplug_blockers) { + error_propagate(errp, error_copy(dev->unplug_blockers->data)); + return true; + } + + return false; +} + static bool device_get_realized(Object *obj, Error **errp) { DeviceState *dev = DEVICE(obj); @@ -704,6 +726,8 @@ static void device_finalize(Object *obj) DeviceState *dev = DEVICE(obj); + g_assert(!dev->unplug_blockers); + QLIST_FOREACH_SAFE(ngl, &dev->gpios, node, next) { QLIST_REMOVE(ngl, node); qemu_free_irqs(ngl->in, ngl->num_in); diff --git a/hw/pci/msi.c b/hw/pci/msi.c index 47d2b0f..5c471b9 100644 --- a/hw/pci/msi.c +++ b/hw/pci/msi.c @@ -134,7 +134,7 @@ void msi_set_message(PCIDevice *dev, MSIMessage msg) pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data); } -MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector) +static MSIMessage msi_prepare_message(PCIDevice *dev, unsigned int vector) { uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev)); bool msi64bit = flags & PCI_MSI_FLAGS_64BIT; @@ -159,6 +159,11 @@ MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector) return msg; } +MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector) +{ + return dev->msi_prepare_message(dev, vector); +} + bool msi_enabled(const PCIDevice *dev) { return msi_present(dev) && @@ -241,6 +246,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset, 0xffffffff >> (PCI_MSI_VECTORS_MAX - nr_vectors)); } + dev->msi_prepare_message = msi_prepare_message; + return 0; } @@ -256,6 +263,7 @@ void msi_uninit(struct PCIDevice *dev) cap_size = msi_cap_sizeof(flags); pci_del_capability(dev, PCI_CAP_ID_MSI, cap_size); dev->cap_present &= ~QEMU_PCI_CAP_MSI; + dev->msi_prepare_message = NULL; MSI_DEV_PRINTF(dev, "uninit\n"); } @@ -307,6 +315,39 @@ bool msi_is_masked(const PCIDevice *dev, unsigned int vector) return mask & (1U << vector); } 
+void msi_set_mask(PCIDevice *dev, int vector, bool mask, Error **errp) +{ + ERRP_GUARD(); + uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev)); + bool msi64bit = flags & PCI_MSI_FLAGS_64BIT; + uint32_t irq_state, vector_mask, pending; + + if (vector > PCI_MSI_VECTORS_MAX) { + error_setg(errp, "msi: vector %d not allocated. max vector is %d", + vector, PCI_MSI_VECTORS_MAX); + return; + } + + vector_mask = (1U << vector); + + irq_state = pci_get_long(dev->config + msi_mask_off(dev, msi64bit)); + + if (mask) { + irq_state |= vector_mask; + } else { + irq_state &= ~vector_mask; + } + + pci_set_long(dev->config + msi_mask_off(dev, msi64bit), irq_state); + + pending = pci_get_long(dev->config + msi_pending_off(dev, msi64bit)); + if (!mask && (pending & vector_mask)) { + pending &= ~vector_mask; + pci_set_long(dev->config + msi_pending_off(dev, msi64bit), pending); + msi_notify(dev, vector); + } +} + void msi_notify(PCIDevice *dev, unsigned int vector) { uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev)); @@ -334,11 +375,7 @@ void msi_notify(PCIDevice *dev, unsigned int vector) void msi_send_message(PCIDevice *dev, MSIMessage msg) { - MemTxAttrs attrs = {}; - - attrs.requester_id = pci_requester_id(dev); - address_space_stl_le(&dev->bus_master_as, msg.address, msg.data, - attrs, NULL); + dev->msi_trigger(dev, msg); } /* Normally called by pci_default_write_config(). 
*/ diff --git a/hw/pci/msix.c b/hw/pci/msix.c index ae9331c..1e381a9 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -31,7 +31,7 @@ #define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8) #define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8) -MSIMessage msix_get_message(PCIDevice *dev, unsigned vector) +static MSIMessage msix_prepare_message(PCIDevice *dev, unsigned vector) { uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE; MSIMessage msg; @@ -41,6 +41,11 @@ MSIMessage msix_get_message(PCIDevice *dev, unsigned vector) return msg; } +MSIMessage msix_get_message(PCIDevice *dev, unsigned vector) +{ + return dev->msix_prepare_message(dev, vector); +} + /* * Special API for POWER to configure the vectors through * a side channel. Should never be used by devices. @@ -131,6 +136,31 @@ static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked) } } +void msix_set_mask(PCIDevice *dev, int vector, bool mask, Error **errp) +{ + ERRP_GUARD(); + unsigned offset; + bool was_masked; + + if (vector > dev->msix_entries_nr) { + error_setg(errp, "msix: vector %d not allocated. 
max vector is %d", + vector, dev->msix_entries_nr); + return; + } + + offset = vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL; + + was_masked = msix_is_masked(dev, vector); + + if (mask) { + dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT; + } else { + dev->msix_table[offset] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; + } + + msix_handle_mask_update(dev, vector, was_masked); +} + static bool msix_masked(PCIDevice *dev) { return dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK; @@ -344,6 +374,8 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries, "msix-pba", pba_size); memory_region_add_subregion(pba_bar, pba_offset, &dev->msix_pba_mmio); + dev->msix_prepare_message = msix_prepare_message; + return 0; } @@ -429,6 +461,7 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar) g_free(dev->msix_entry_used); dev->msix_entry_used = NULL; dev->cap_present &= ~QEMU_PCI_CAP_MSIX; + dev->msix_prepare_message = NULL; } void msix_uninit_exclusive_bar(PCIDevice *dev) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 6e70153..2f450f6 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -317,6 +317,15 @@ void pci_device_deassert_intx(PCIDevice *dev) } } +static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg) +{ + MemTxAttrs attrs = {}; + + attrs.requester_id = pci_requester_id(dev); + address_space_stl_le(&dev->bus_master_as, msg.address, msg.data, + attrs, NULL); +} + static void pci_reset_regions(PCIDevice *dev) { int r; @@ -1212,6 +1221,8 @@ static void pci_qdev_unrealize(DeviceState *dev) pci_device_deassert_intx(pci_dev); do_pci_unregister_device(pci_dev); + + pci_dev->msi_trigger = NULL; } void pci_register_bar(PCIDevice *pci_dev, int region_num, @@ -2251,6 +2262,8 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp) } pci_set_power(pci_dev, true); + + pci_dev->msi_trigger = pci_msi_trigger; } PCIDevice *pci_new_multifunction(int devfn, bool multifunction, diff --git a/hw/remote/Kconfig 
b/hw/remote/Kconfig index 08c16e2..2d6b4f4 100644 --- a/hw/remote/Kconfig +++ b/hw/remote/Kconfig @@ -2,3 +2,7 @@ config MULTIPROCESS bool depends on PCI && PCI_EXPRESS && KVM select REMOTE_PCIHOST + +config VFIO_USER_SERVER + bool + depends on MULTIPROCESS diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c new file mode 100644 index 0000000..fd723d9 --- /dev/null +++ b/hw/remote/iommu.c @@ -0,0 +1,131 @@ +/** + * IOMMU for remote device + * + * Copyright © 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "hw/remote/iommu.h" +#include "hw/pci/pci_bus.h" +#include "hw/pci/pci.h" +#include "exec/memory.h" +#include "exec/address-spaces.h" +#include "trace.h" + +/** + * IOMMU for TYPE_REMOTE_MACHINE - manages DMA address space isolation + * for remote machine. It is used by TYPE_VFIO_USER_SERVER. + * + * - Each TYPE_VFIO_USER_SERVER instance handles one PCIDevice on a PCIBus. + * There is one RemoteIommu per PCIBus, so the RemoteIommu tracks multiple + * PCIDevices by maintaining a ->elem_by_devfn mapping. + * + * - memory_region_init_iommu() is not used because vfio-user MemoryRegions + * will be added to the elem->mr container instead. This is more natural + * than implementing the IOMMUMemoryRegionClass APIs since vfio-user + * provides something that is close to a full-fledged MemoryRegion and + * not like an IOMMU mapping. + * + * - When a device is hot unplugged, the elem->mr reference is dropped so + * all vfio-user MemoryRegions associated with this vfio-user server are + * destroyed. 
+ */ + +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus, + void *opaque, int devfn) +{ + RemoteIommu *iommu = opaque; + RemoteIommuElem *elem = NULL; + + qemu_mutex_lock(&iommu->lock); + + elem = g_hash_table_lookup(iommu->elem_by_devfn, INT2VOIDP(devfn)); + + if (!elem) { + elem = g_malloc0(sizeof(RemoteIommuElem)); + g_hash_table_insert(iommu->elem_by_devfn, INT2VOIDP(devfn), elem); + } + + if (!elem->mr) { + elem->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION)); + memory_region_set_size(elem->mr, UINT64_MAX); + address_space_init(&elem->as, elem->mr, NULL); + } + + qemu_mutex_unlock(&iommu->lock); + + return &elem->as; +} + +void remote_iommu_unplug_dev(PCIDevice *pci_dev) +{ + AddressSpace *as = pci_device_iommu_address_space(pci_dev); + RemoteIommuElem *elem = NULL; + + if (as == &address_space_memory) { + return; + } + + elem = container_of(as, RemoteIommuElem, as); + + address_space_destroy(&elem->as); + + object_unref(elem->mr); + + elem->mr = NULL; +} + +static void remote_iommu_init(Object *obj) +{ + RemoteIommu *iommu = REMOTE_IOMMU(obj); + + iommu->elem_by_devfn = g_hash_table_new_full(NULL, NULL, NULL, g_free); + + qemu_mutex_init(&iommu->lock); +} + +static void remote_iommu_finalize(Object *obj) +{ + RemoteIommu *iommu = REMOTE_IOMMU(obj); + + qemu_mutex_destroy(&iommu->lock); + + g_hash_table_destroy(iommu->elem_by_devfn); + + iommu->elem_by_devfn = NULL; +} + +void remote_iommu_setup(PCIBus *pci_bus) +{ + RemoteIommu *iommu = NULL; + + g_assert(pci_bus); + + iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU)); + + pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu); + + object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu)); + + object_unref(OBJECT(iommu)); +} + +static const TypeInfo remote_iommu_info = { + .name = TYPE_REMOTE_IOMMU, + .parent = TYPE_OBJECT, + .instance_size = sizeof(RemoteIommu), + .instance_init = remote_iommu_init, + .instance_finalize = remote_iommu_finalize, +}; + +static void 
remote_iommu_register_types(void) +{ + type_register_static(&remote_iommu_info); +} + +type_init(remote_iommu_register_types) diff --git a/hw/remote/machine.c b/hw/remote/machine.c index 92d71d4..75d550d 100644 --- a/hw/remote/machine.c +++ b/hw/remote/machine.c @@ -20,6 +20,11 @@ #include "qapi/error.h" #include "hw/pci/pci_host.h" #include "hw/remote/iohub.h" +#include "hw/remote/iommu.h" +#include "hw/qdev-core.h" +#include "hw/remote/iommu.h" +#include "hw/remote/vfio-user-obj.h" +#include "hw/pci/msi.h" static void remote_machine_init(MachineState *machine) { @@ -49,25 +54,102 @@ static void remote_machine_init(MachineState *machine) pci_host = PCI_HOST_BRIDGE(rem_host); - remote_iohub_init(&s->iohub); + if (s->vfio_user) { + remote_iommu_setup(pci_host->bus); - pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq, - &s->iohub, REMOTE_IOHUB_NB_PIRQS); + msi_nonbroken = true; + + vfu_object_set_bus_irq(pci_host->bus); + } else { + remote_iohub_init(&s->iohub); + + pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq, + &s->iohub, REMOTE_IOHUB_NB_PIRQS); + } + + qbus_set_hotplug_handler(BUS(pci_host->bus), OBJECT(s)); +} + +static bool remote_machine_get_vfio_user(Object *obj, Error **errp) +{ + RemoteMachineState *s = REMOTE_MACHINE(obj); + + return s->vfio_user; +} + +static void remote_machine_set_vfio_user(Object *obj, bool value, Error **errp) +{ + RemoteMachineState *s = REMOTE_MACHINE(obj); + + if (phase_check(PHASE_MACHINE_CREATED)) { + error_setg(errp, "Error enabling vfio-user - machine already created"); + return; + } + + s->vfio_user = value; +} + +static bool remote_machine_get_auto_shutdown(Object *obj, Error **errp) +{ + RemoteMachineState *s = REMOTE_MACHINE(obj); + + return s->auto_shutdown; +} + +static void remote_machine_set_auto_shutdown(Object *obj, bool value, + Error **errp) +{ + RemoteMachineState *s = REMOTE_MACHINE(obj); + + s->auto_shutdown = value; +} + +static void 
remote_machine_instance_init(Object *obj) +{ + RemoteMachineState *s = REMOTE_MACHINE(obj); + + s->auto_shutdown = true; +} + +static void remote_machine_dev_unplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + qdev_unrealize(dev); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + remote_iommu_unplug_dev(PCI_DEVICE(dev)); + } } static void remote_machine_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); mc->init = remote_machine_init; mc->desc = "Experimental remote machine"; + + hc->unplug = remote_machine_dev_unplug_cb; + + object_class_property_add_bool(oc, "vfio-user", + remote_machine_get_vfio_user, + remote_machine_set_vfio_user); + + object_class_property_add_bool(oc, "auto-shutdown", + remote_machine_get_auto_shutdown, + remote_machine_set_auto_shutdown); } static const TypeInfo remote_machine = { .name = TYPE_REMOTE_MACHINE, .parent = TYPE_MACHINE, .instance_size = sizeof(RemoteMachineState), + .instance_init = remote_machine_instance_init, .class_init = remote_machine_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { } + } }; static void remote_machine_register_types(void) diff --git a/hw/remote/meson.build b/hw/remote/meson.build index e6a5574..ab25c04 100644 --- a/hw/remote/meson.build +++ b/hw/remote/meson.build @@ -6,6 +6,10 @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c')) remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c')) remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c')) remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iommu.c')) +remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: files('vfio-user-obj.c')) + +remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: libvfio_user_dep) specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: 
files('memory.c')) specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c')) diff --git a/hw/remote/trace-events b/hw/remote/trace-events index 0b23974..c167b3c 100644 --- a/hw/remote/trace-events +++ b/hw/remote/trace-events @@ -2,3 +2,14 @@ mpqemu_send_io_error(int cmd, int size, int nfds) "send command %d size %d, %d file descriptors to remote process" mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive %d size %d, %d file descriptors to remote process" + +# vfio-user-obj.c +vfu_prop(const char *prop, const char *val) "vfu: setting %s as %s" +vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%u -> 0x%x" +vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%u <- 0x%x" +vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes" +vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64"" +vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64"" +vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64"" +vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64"" +vfu_interrupt(int pirq) "vfu: sending interrupt to device - PIRQ %d" diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c new file mode 100644 index 0000000..c6cc53a --- /dev/null +++ b/hw/remote/vfio-user-obj.c @@ -0,0 +1,958 @@ +/** + * QEMU vfio-user-server server object + * + * Copyright © 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL-v2, version 2 or later. + * + * See the COPYING file in the top-level directory. + * + */ + +/** + * Usage: add options: + * -machine x-remote,vfio-user=on,auto-shutdown=on + * -device <PCI-device>,id=<pci-dev-id> + * -object x-vfio-user-server,id=<id>,type=unix,path=<socket-path>, + * device=<pci-dev-id> + * + * Note that x-vfio-user-server object must be used with x-remote machine only. 
+ * This server could only support PCI devices for now. + * + * type - SocketAddress type - presently "unix" alone is supported. Required + * option + * + * path - named unix socket, it will be created by the server. It is + * a required option + * + * device - id of a device on the server, a required option. PCI devices + * alone are supported presently. + * + * notes - x-vfio-user-server could block IO and monitor during the + * initialization phase. + */ + +#include "qemu/osdep.h" + +#include "qom/object.h" +#include "qom/object_interfaces.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "sysemu/runstate.h" +#include "hw/boards.h" +#include "hw/remote/machine.h" +#include "qapi/error.h" +#include "qapi/qapi-visit-sockets.h" +#include "qapi/qapi-events-misc.h" +#include "qemu/notify.h" +#include "qemu/thread.h" +#include "qemu/main-loop.h" +#include "sysemu/sysemu.h" +#include "libvfio-user.h" +#include "hw/qdev-core.h" +#include "hw/pci/pci.h" +#include "qemu/timer.h" +#include "exec/memory.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" +#include "hw/remote/vfio-user-obj.h" + +#define TYPE_VFU_OBJECT "x-vfio-user-server" +OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT) + +/** + * VFU_OBJECT_ERROR - reports an error message. If auto_shutdown + * is set, it aborts the machine on error. Otherwise, it logs an + * error message without aborting. + */ +#define VFU_OBJECT_ERROR(o, fmt, ...) 
\ + { \ + if (vfu_object_auto_shutdown()) { \ + error_setg(&error_abort, (fmt), ## __VA_ARGS__); \ + } else { \ + error_report((fmt), ## __VA_ARGS__); \ + } \ + } \ + +struct VfuObjectClass { + ObjectClass parent_class; + + unsigned int nr_devs; +}; + +struct VfuObject { + /* private */ + Object parent; + + SocketAddress *socket; + + char *device; + + Error *err; + + Notifier machine_done; + + vfu_ctx_t *vfu_ctx; + + PCIDevice *pci_dev; + + Error *unplug_blocker; + + int vfu_poll_fd; + + MSITriggerFunc *default_msi_trigger; + MSIPrepareMessageFunc *default_msi_prepare_message; + MSIxPrepareMessageFunc *default_msix_prepare_message; +}; + +static void vfu_object_init_ctx(VfuObject *o, Error **errp); + +static bool vfu_object_auto_shutdown(void) +{ + bool auto_shutdown = true; + Error *local_err = NULL; + + if (!current_machine) { + return auto_shutdown; + } + + auto_shutdown = object_property_get_bool(OBJECT(current_machine), + "auto-shutdown", + &local_err); + + /* + * local_err would be set if no such property exists - safe to ignore. 
+ * Unlikely scenario as auto-shutdown is always defined for + * TYPE_REMOTE_MACHINE, and TYPE_VFU_OBJECT only works with + * TYPE_REMOTE_MACHINE + */ + if (local_err) { + auto_shutdown = true; + error_free(local_err); + } + + return auto_shutdown; +} + +static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + VfuObject *o = VFU_OBJECT(obj); + + if (o->vfu_ctx) { + error_setg(errp, "vfu: Unable to set socket property - server busy"); + return; + } + + qapi_free_SocketAddress(o->socket); + + o->socket = NULL; + + visit_type_SocketAddress(v, name, &o->socket, errp); + + if (o->socket->type != SOCKET_ADDRESS_TYPE_UNIX) { + error_setg(errp, "vfu: Unsupported socket type - %s", + SocketAddressType_str(o->socket->type)); + qapi_free_SocketAddress(o->socket); + o->socket = NULL; + return; + } + + trace_vfu_prop("socket", o->socket->u.q_unix.path); + + vfu_object_init_ctx(o, errp); +} + +static void vfu_object_set_device(Object *obj, const char *str, Error **errp) +{ + VfuObject *o = VFU_OBJECT(obj); + + if (o->vfu_ctx) { + error_setg(errp, "vfu: Unable to set device property - server busy"); + return; + } + + g_free(o->device); + + o->device = g_strdup(str); + + trace_vfu_prop("device", str); + + vfu_object_init_ctx(o, errp); +} + +static void vfu_object_ctx_run(void *opaque) +{ + VfuObject *o = opaque; + const char *vfu_id; + char *vfu_path, *pci_dev_path; + int ret = -1; + + while (ret != 0) { + ret = vfu_run_ctx(o->vfu_ctx); + if (ret < 0) { + if (errno == EINTR) { + continue; + } else if (errno == ENOTCONN) { + vfu_id = object_get_canonical_path_component(OBJECT(o)); + vfu_path = object_get_canonical_path(OBJECT(o)); + g_assert(o->pci_dev); + pci_dev_path = object_get_canonical_path(OBJECT(o->pci_dev)); + /* o->device is a required property and is non-NULL here */ + g_assert(o->device); + qapi_event_send_vfu_client_hangup(vfu_id, vfu_path, + o->device, pci_dev_path); + qemu_set_fd_handler(o->vfu_poll_fd, NULL, 
NULL, NULL); + o->vfu_poll_fd = -1; + object_unparent(OBJECT(o)); + g_free(vfu_path); + g_free(pci_dev_path); + break; + } else { + VFU_OBJECT_ERROR(o, "vfu: Failed to run device %s - %s", + o->device, strerror(errno)); + break; + } + } + } +} + +static void vfu_object_attach_ctx(void *opaque) +{ + VfuObject *o = opaque; + GPollFD pfds[1]; + int ret; + + qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL); + + pfds[0].fd = o->vfu_poll_fd; + pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR; + +retry_attach: + ret = vfu_attach_ctx(o->vfu_ctx); + if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + /** + * vfu_object_attach_ctx can block QEMU's main loop + * during attach - the monitor and other IO + * could be unresponsive during this time. + */ + (void)qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS); + goto retry_attach; + } else if (ret < 0) { + VFU_OBJECT_ERROR(o, "vfu: Failed to attach device %s to context - %s", + o->device, strerror(errno)); + return; + } + + o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx); + if (o->vfu_poll_fd < 0) { + VFU_OBJECT_ERROR(o, "vfu: Failed to get poll fd %s", o->device); + return; + } + + qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o); +} + +static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf, + size_t count, loff_t offset, + const bool is_write) +{ + VfuObject *o = vfu_get_private(vfu_ctx); + uint32_t pci_access_width = sizeof(uint32_t); + size_t bytes = count; + uint32_t val = 0; + char *ptr = buf; + int len; + + /* + * Writes to the BAR registers would trigger an update to the + * global Memory and IO AddressSpaces. But the remote device + * never uses the global AddressSpaces, therefore overlapping + * memory regions are not a problem + */ + while (bytes > 0) { + len = (bytes > pci_access_width) ? 
pci_access_width : bytes; + if (is_write) { + memcpy(&val, ptr, len); + pci_host_config_write_common(o->pci_dev, offset, + pci_config_size(o->pci_dev), + val, len); + trace_vfu_cfg_write(offset, val); + } else { + val = pci_host_config_read_common(o->pci_dev, offset, + pci_config_size(o->pci_dev), len); + memcpy(ptr, &val, len); + trace_vfu_cfg_read(offset, val); + } + offset += len; + ptr += len; + bytes -= len; + } + + return count; +} + +static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) +{ + VfuObject *o = vfu_get_private(vfu_ctx); + AddressSpace *dma_as = NULL; + MemoryRegion *subregion = NULL; + g_autofree char *name = NULL; + struct iovec *iov = &info->iova; + + if (!info->vaddr) { + return; + } + + name = g_strdup_printf("mem-%s-%"PRIx64"", o->device, + (uint64_t)info->vaddr); + + subregion = g_new0(MemoryRegion, 1); + + memory_region_init_ram_ptr(subregion, NULL, name, + iov->iov_len, info->vaddr); + + dma_as = pci_device_iommu_address_space(o->pci_dev); + + memory_region_add_subregion(dma_as->root, (hwaddr)iov->iov_base, subregion); + + trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len); +} + +static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) +{ + VfuObject *o = vfu_get_private(vfu_ctx); + AddressSpace *dma_as = NULL; + MemoryRegion *mr = NULL; + ram_addr_t offset; + + mr = memory_region_from_host(info->vaddr, &offset); + if (!mr) { + return; + } + + dma_as = pci_device_iommu_address_space(o->pci_dev); + + memory_region_del_subregion(dma_as->root, mr); + + object_unparent((OBJECT(mr))); + + trace_vfu_dma_unregister((uint64_t)info->iova.iov_base); +} + +static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset, + hwaddr size, const bool is_write) +{ + uint8_t *ptr = buf; + bool release_lock = false; + uint8_t *ram_ptr = NULL; + MemTxResult result; + int access_size; + uint64_t val; + + if (memory_access_is_direct(mr, is_write)) { + /** + * Some devices expose a PCI expansion ROM, which could be 
buffer + * based as compared to other regions which are primarily based on + * MemoryRegionOps. memory_region_find() would already check + * for buffer overflow, we don't need to repeat it here. + */ + ram_ptr = memory_region_get_ram_ptr(mr); + + if (is_write) { + memcpy((ram_ptr + offset), buf, size); + } else { + memcpy(buf, (ram_ptr + offset), size); + } + + return 0; + } + + while (size) { + /** + * The read/write logic used below is similar to the ones in + * flatview_read/write_continue() + */ + release_lock = prepare_mmio_access(mr); + + access_size = memory_access_size(mr, size, offset); + + if (is_write) { + val = ldn_he_p(ptr, access_size); + + result = memory_region_dispatch_write(mr, offset, val, + size_memop(access_size), + MEMTXATTRS_UNSPECIFIED); + } else { + result = memory_region_dispatch_read(mr, offset, &val, + size_memop(access_size), + MEMTXATTRS_UNSPECIFIED); + + stn_he_p(ptr, access_size, val); + } + + if (release_lock) { + qemu_mutex_unlock_iothread(); + release_lock = false; + } + + if (result != MEMTX_OK) { + return -1; + } + + size -= access_size; + ptr += access_size; + offset += access_size; + } + + return 0; +} + +static size_t vfu_object_bar_rw(PCIDevice *pci_dev, int pci_bar, + hwaddr bar_offset, char * const buf, + hwaddr len, const bool is_write) +{ + MemoryRegionSection section = { 0 }; + uint8_t *ptr = (uint8_t *)buf; + MemoryRegion *section_mr = NULL; + uint64_t section_size; + hwaddr section_offset; + hwaddr size = 0; + + while (len) { + section = memory_region_find(pci_dev->io_regions[pci_bar].memory, + bar_offset, len); + + if (!section.mr) { + warn_report("vfu: invalid address 0x%"PRIx64"", bar_offset); + return size; + } + + section_mr = section.mr; + section_offset = section.offset_within_region; + section_size = int128_get64(section.size); + + if (is_write && section_mr->readonly) { + warn_report("vfu: attempting to write to readonly region in " + "bar %d - [0x%"PRIx64" - 0x%"PRIx64"]", + pci_bar, bar_offset, + 
(bar_offset + section_size));
+            memory_region_unref(section_mr);
+            return size;
+        }
+
+        if (vfu_object_mr_rw(section_mr, ptr, section_offset,
+                             section_size, is_write)) {
+            warn_report("vfu: failed to %s "
+                        "[0x%"PRIx64" - 0x%"PRIx64"] in bar %d",
+                        is_write ? "write to" : "read from", bar_offset,
+                        (bar_offset + section_size), pci_bar);
+            memory_region_unref(section_mr);
+            return size;
+        }
+
+        size += section_size;
+        bar_offset += section_size;
+        ptr += section_size;
+        len -= section_size;
+
+        memory_region_unref(section_mr);
+    }
+
+    return size;
+}
+
+/**
+ * VFU_OBJECT_BAR_HANDLER - macro for defining handlers for PCI BARs.
+ *
+ * To create handler for BAR number 2, VFU_OBJECT_BAR_HANDLER(2) would
+ * define vfu_object_bar2_handler
+ */
+#define VFU_OBJECT_BAR_HANDLER(BAR_NO) \
+    static ssize_t vfu_object_bar##BAR_NO##_handler(vfu_ctx_t *vfu_ctx, \
+                                        char * const buf, size_t count, \
+                                        loff_t offset, const bool is_write) \
+    { \
+        VfuObject *o = vfu_get_private(vfu_ctx); \
+        PCIDevice *pci_dev = o->pci_dev; \
+ \
+        return vfu_object_bar_rw(pci_dev, BAR_NO, offset, \
+                                 buf, count, is_write); \
+    } \
+
+VFU_OBJECT_BAR_HANDLER(0)
+VFU_OBJECT_BAR_HANDLER(1)
+VFU_OBJECT_BAR_HANDLER(2)
+VFU_OBJECT_BAR_HANDLER(3)
+VFU_OBJECT_BAR_HANDLER(4)
+VFU_OBJECT_BAR_HANDLER(5)
+VFU_OBJECT_BAR_HANDLER(6)
+
+static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
+    &vfu_object_bar0_handler,
+    &vfu_object_bar1_handler,
+    &vfu_object_bar2_handler,
+    &vfu_object_bar3_handler,
+    &vfu_object_bar4_handler,
+    &vfu_object_bar5_handler,
+    &vfu_object_bar6_handler,
+};
+
+/**
+ * vfu_object_register_bars - Identify active BAR regions of pdev and setup
+ *                            callbacks to handle read/write accesses
+ *
+ * Regions with a zero size are skipped. The ROM region and any region whose
+ * MemoryRegion is marked readonly are registered without
+ * VFU_REGION_FLAG_WRITE; every other region is registered read/write.
+ */
+static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        /*
+         * Start from read/write for every region. Keeping the flags across
+         * iterations would make every BAR after the first read-only one
+         * read-only as well.
+         */
+        int flags = VFU_REGION_FLAG_RW;
+
+        if (!pdev->io_regions[i].size) {
+            continue;
+        }
+
+        if ((i == VFU_PCI_DEV_ROM_REGION_IDX) ||
+            pdev->io_regions[i].memory->readonly) {
+            flags &= ~VFU_REGION_FLAG_WRITE;
+        }
+
+        vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX + i,
+                         (size_t)pdev->io_regions[i].size,
+                         vfu_object_bar_handlers[i],
+                         flags, NULL, 0, -1, 0);
+
+        trace_vfu_bar_register(i, pdev->io_regions[i].addr,
+                               pdev->io_regions[i].size);
+    }
+}
+
+static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
+{
+    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
+                                pci_dev->devfn);
+
+    return pci_bdf;
+}
+
+static void vfu_object_set_irq(void *opaque, int pirq, int level)
+{
+    PCIBus *pci_bus = opaque;
+    PCIDevice *pci_dev = NULL;
+    vfu_ctx_t *vfu_ctx = NULL;
+    int pci_bus_num, devfn;
+
+    if (level) {
+        pci_bus_num = PCI_BUS_NUM(pirq);
+        devfn = PCI_BDF_TO_DEVFN(pirq);
+
+        /*
+         * pci_find_device() performs at O(1) if the device is attached
+         * to the root PCI bus. Whereas, if the device is attached to a
+         * secondary PCI bus (such as when a root port is involved),
+         * finding the parent PCI bus could take O(n)
+         */
+        pci_dev = pci_find_device(pci_bus, pci_bus_num, devfn);
+
+        vfu_ctx = pci_dev->irq_opaque;
+
+        g_assert(vfu_ctx);
+
+        vfu_irq_trigger(vfu_ctx, 0);
+    }
+}
+
+static MSIMessage vfu_object_msi_prepare_msg(PCIDevice *pci_dev,
+                                             unsigned int vector)
+{
+    MSIMessage msg;
+
+    msg.address = 0;
+    msg.data = vector;
+
+    return msg;
+}
+
+static void vfu_object_msi_trigger(PCIDevice *pci_dev, MSIMessage msg)
+{
+    vfu_ctx_t *vfu_ctx = pci_dev->irq_opaque;
+
+    vfu_irq_trigger(vfu_ctx, msg.data);
+}
+
+static void vfu_object_setup_msi_cbs(VfuObject *o)
+{
+    o->default_msi_trigger = o->pci_dev->msi_trigger;
+    o->default_msi_prepare_message = o->pci_dev->msi_prepare_message;
+    o->default_msix_prepare_message = o->pci_dev->msix_prepare_message;
+
+    o->pci_dev->msi_trigger = vfu_object_msi_trigger;
+    o->pci_dev->msi_prepare_message = vfu_object_msi_prepare_msg;
+    o->pci_dev->msix_prepare_message = vfu_object_msi_prepare_msg;
+}
+
+static void vfu_object_restore_msi_cbs(VfuObject 
*o)
+{
+    o->pci_dev->msi_trigger = o->default_msi_trigger;
+    o->pci_dev->msi_prepare_message = o->default_msi_prepare_message;
+    o->pci_dev->msix_prepare_message = o->default_msix_prepare_message;
+}
+
+/*
+ * libvfio-user IRQ state callback for MSI-X: apply @mask to the @count
+ * vectors beginning at vector @start. A failure on one vector is reported
+ * and does not stop the remaining vectors from being processed.
+ */
+static void vfu_msix_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
+                               uint32_t count, bool mask)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    uint32_t end = start + count;
+    Error *err = NULL;
+    uint32_t vector;
+
+    /* @count is a length, not an end index: cover [start, start + count) */
+    for (vector = start; vector < end; vector++) {
+        msix_set_mask(o->pci_dev, vector, mask, &err);
+        if (err) {
+            VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
+                             error_get_pretty(err));
+            error_free(err);
+            err = NULL;
+        }
+    }
+}
+
+/*
+ * libvfio-user IRQ state callback for MSI: apply @mask to the @count
+ * vectors beginning at vector @start. A failure on one vector is reported
+ * and does not stop the remaining vectors from being processed.
+ */
+static void vfu_msi_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
+                              uint32_t count, bool mask)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    uint32_t end = start + count;
+    Error *err = NULL;
+    uint32_t vector;
+
+    /* @count is a length, not an end index: cover [start, start + count) */
+    for (vector = start; vector < end; vector++) {
+        msi_set_mask(o->pci_dev, vector, mask, &err);
+        if (err) {
+            VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
+                             error_get_pretty(err));
+            error_free(err);
+            err = NULL;
+        }
+    }
+}
+
+static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
+{
+    vfu_ctx_t *vfu_ctx = o->vfu_ctx;
+    int ret;
+
+    ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (msix_nr_vectors_allocated(pci_dev)) {
+        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
+                                       msix_nr_vectors_allocated(pci_dev));
+        vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSIX_IRQ,
+                                     &vfu_msix_irq_state);
+    } else if (msi_nr_vectors_allocated(pci_dev)) {
+        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
+                                       msi_nr_vectors_allocated(pci_dev));
+        vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSI_IRQ,
+                                     &vfu_msi_irq_state);
+    }
+
+    if (ret < 0) {
+        return ret;
+    }
+
+    vfu_object_setup_msi_cbs(o);
+
+    pci_dev->irq_opaque = vfu_ctx;
+
+    return 0;
+}
+
+void vfu_object_set_bus_irq(PCIBus *pci_bus)
+{
+    int bus_num = pci_bus_num(pci_bus);
+    int max_bdf = PCI_BUILD_BDF(bus_num, PCI_DEVFN_MAX - 1);
+
+    
pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus, + max_bdf); +} + +static int vfu_object_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) +{ + VfuObject *o = vfu_get_private(vfu_ctx); + + /* vfu_object_ctx_run() handles lost connection */ + if (type == VFU_RESET_LOST_CONN) { + return 0; + } + + qdev_reset_all(DEVICE(o->pci_dev)); + + return 0; +} + +/* + * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device' + * properties. It also depends on devices instantiated in QEMU. These + * dependencies are not available during the instance_init phase of this + * object's life-cycle. As such, the server is initialized after the + * machine is setup. machine_init_done_notifier notifies TYPE_VFU_OBJECT + * when the machine is setup, and the dependencies are available. + */ +static void vfu_object_machine_done(Notifier *notifier, void *data) +{ + VfuObject *o = container_of(notifier, VfuObject, machine_done); + Error *err = NULL; + + vfu_object_init_ctx(o, &err); + + if (err) { + error_propagate(&error_abort, err); + } +} + +/** + * vfu_object_init_ctx: Create and initialize libvfio-user context. Add + * an unplug blocker for the associated PCI device. Setup a FD handler + * to process incoming messages in the context's socket. + * + * The socket and device properties are mandatory, and this function + * will not create the context without them - the setters for these + * properties should call this function when the property is set. The + * machine should also be ready when this function is invoked - it is + * because QEMU objects are initialized before devices, and the + * associated PCI device wouldn't be available at the object + * initialization time. Until these conditions are satisfied, this + * function would return early without performing any task. 
+ */ +static void vfu_object_init_ctx(VfuObject *o, Error **errp) +{ + ERRP_GUARD(); + DeviceState *dev = NULL; + vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL; + int ret; + + if (o->vfu_ctx || !o->socket || !o->device || + !phase_check(PHASE_MACHINE_READY)) { + return; + } + + if (o->err) { + error_propagate(errp, o->err); + o->err = NULL; + return; + } + + o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path, + LIBVFIO_USER_FLAG_ATTACH_NB, + o, VFU_DEV_TYPE_PCI); + if (o->vfu_ctx == NULL) { + error_setg(errp, "vfu: Failed to create context - %s", strerror(errno)); + return; + } + + dev = qdev_find_recursive(sysbus_get_default(), o->device); + if (dev == NULL) { + error_setg(errp, "vfu: Device %s not found", o->device); + goto fail; + } + + if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + error_setg(errp, "vfu: %s not a PCI device", o->device); + goto fail; + } + + o->pci_dev = PCI_DEVICE(dev); + + object_ref(OBJECT(o->pci_dev)); + + if (pci_is_express(o->pci_dev)) { + pci_type = VFU_PCI_TYPE_EXPRESS; + } + + ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0); + if (ret < 0) { + error_setg(errp, + "vfu: Failed to attach PCI device %s to context - %s", + o->device, strerror(errno)); + goto fail; + } + + error_setg(&o->unplug_blocker, + "vfu: %s for %s must be deleted before unplugging", + TYPE_VFU_OBJECT, o->device); + qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker); + + ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, + pci_config_size(o->pci_dev), &vfu_object_cfg_access, + VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB, + NULL, 0, -1, 0); + if (ret < 0) { + error_setg(errp, + "vfu: Failed to setup config space handlers for %s- %s", + o->device, strerror(errno)); + goto fail; + } + + ret = vfu_setup_device_dma(o->vfu_ctx, &dma_register, &dma_unregister); + if (ret < 0) { + error_setg(errp, "vfu: Failed to setup DMA handlers for %s", + o->device); + goto fail; + } + + 
vfu_object_register_bars(o->vfu_ctx, o->pci_dev);
+
+    ret = vfu_object_setup_irqs(o, o->pci_dev);
+    if (ret < 0) {
+        error_setg(errp, "vfu: Failed to setup interrupts for %s",
+                   o->device);
+        goto fail;
+    }
+
+    ret = vfu_setup_device_reset_cb(o->vfu_ctx, &vfu_object_device_reset);
+    if (ret < 0) {
+        error_setg(errp, "vfu: Failed to setup reset callback");
+        goto fail;
+    }
+
+    ret = vfu_realize_ctx(o->vfu_ctx);
+    if (ret < 0) {
+        error_setg(errp, "vfu: Failed to realize device %s- %s",
+                   o->device, strerror(errno));
+        goto fail;
+    }
+
+    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
+    if (o->vfu_poll_fd < 0) {
+        error_setg(errp, "vfu: Failed to get poll fd %s", o->device);
+        goto fail;
+    }
+
+    qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o);
+
+    return;
+
+fail:
+    vfu_destroy_ctx(o->vfu_ctx);
+    if (o->unplug_blocker && o->pci_dev) {
+        qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
+        error_free(o->unplug_blocker);
+        o->unplug_blocker = NULL;
+    }
+    if (o->pci_dev) {
+        vfu_object_restore_msi_cbs(o);
+        o->pci_dev->irq_opaque = NULL;
+        object_unref(OBJECT(o->pci_dev));
+        o->pci_dev = NULL;
+    }
+    o->vfu_ctx = NULL;
+}
+
+/*
+ * Instance initializer: counts the new object in the class-wide nr_devs,
+ * records an error in o->err when the current machine is not
+ * TYPE_REMOTE_MACHINE, and otherwise registers a machine-init-done
+ * notifier if the machine is not ready yet.
+ */
+static void vfu_object_init(Object *obj)
+{
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
+    VfuObject *o = VFU_OBJECT(obj);
+
+    k->nr_devs++;
+
+    /*
+     * Set the "no poll fd" sentinel before any early return.
+     * vfu_object_finalize() tears down the fd handler whenever
+     * vfu_poll_fd != -1, so an instance that bails out below must not be
+     * left with another value (presumably 0 from zero-filled instance
+     * memory - TODO confirm), which would be an fd this object never owned.
+     */
+    o->vfu_poll_fd = -1;
+
+    if (!object_dynamic_cast(OBJECT(current_machine), TYPE_REMOTE_MACHINE)) {
+        error_setg(&o->err, "vfu: %s only compatible with %s machine",
+                   TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
+        return;
+    }
+
+    if (!phase_check(PHASE_MACHINE_READY)) {
+        o->machine_done.notify = vfu_object_machine_done;
+        qemu_add_machine_init_done_notifier(&o->machine_done);
+    }
+}
+
+static void vfu_object_finalize(Object *obj)
+{
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
+    VfuObject *o = VFU_OBJECT(obj);
+
+    k->nr_devs--;
+
+    qapi_free_SocketAddress(o->socket);
+
+    o->socket = NULL;
+
+    if (o->vfu_poll_fd != -1) {
+        qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
+ o->vfu_poll_fd = -1; + } + + if (o->vfu_ctx) { + vfu_destroy_ctx(o->vfu_ctx); + o->vfu_ctx = NULL; + } + + g_free(o->device); + + o->device = NULL; + + if (o->unplug_blocker && o->pci_dev) { + qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker); + error_free(o->unplug_blocker); + o->unplug_blocker = NULL; + } + + if (o->pci_dev) { + vfu_object_restore_msi_cbs(o); + o->pci_dev->irq_opaque = NULL; + object_unref(OBJECT(o->pci_dev)); + o->pci_dev = NULL; + } + + if (!k->nr_devs && vfu_object_auto_shutdown()) { + qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); + } + + if (o->machine_done.notify) { + qemu_remove_machine_init_done_notifier(&o->machine_done); + o->machine_done.notify = NULL; + } +} + +static void vfu_object_class_init(ObjectClass *klass, void *data) +{ + VfuObjectClass *k = VFU_OBJECT_CLASS(klass); + + k->nr_devs = 0; + + object_class_property_add(klass, "socket", "SocketAddress", NULL, + vfu_object_set_socket, NULL, NULL); + object_class_property_set_description(klass, "socket", + "SocketAddress " + "(ex: type=unix,path=/tmp/sock). 
" + "Only UNIX is presently supported"); + object_class_property_add_str(klass, "device", NULL, + vfu_object_set_device); + object_class_property_set_description(klass, "device", + "device ID - only PCI devices " + "are presently supported"); +} + +static const TypeInfo vfu_object_info = { + .name = TYPE_VFU_OBJECT, + .parent = TYPE_OBJECT, + .instance_size = sizeof(VfuObject), + .instance_init = vfu_object_init, + .instance_finalize = vfu_object_finalize, + .class_size = sizeof(VfuObjectClass), + .class_init = vfu_object_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void vfu_register_types(void) +{ + type_register_static(&vfu_object_info); +} + +type_init(vfu_register_types); diff --git a/include/exec/memory.h b/include/exec/memory.h index f1c1945..a6a0f4d 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -2810,6 +2810,9 @@ MemTxResult address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr, const void *buf, hwaddr len); +int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr); +bool prepare_mmio_access(MemoryRegion *mr); + static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write) { if (is_write) { diff --git a/include/hw/pci/msi.h b/include/hw/pci/msi.h index 40876884..58aa576 100644 --- a/include/hw/pci/msi.h +++ b/include/hw/pci/msi.h @@ -43,6 +43,7 @@ void msi_notify(PCIDevice *dev, unsigned int vector); void msi_send_message(PCIDevice *dev, MSIMessage msg); void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len); unsigned int msi_nr_vectors_allocated(const PCIDevice *dev); +void msi_set_mask(PCIDevice *dev, int vector, bool mask, Error **errp); static inline bool msi_present(const PCIDevice *dev) { diff --git a/include/hw/pci/msix.h b/include/hw/pci/msix.h index 4c4a60c..4f1cda0 100644 --- a/include/hw/pci/msix.h +++ b/include/hw/pci/msix.h @@ -36,6 +36,7 @@ void msix_clr_pending(PCIDevice *dev, int vector); int 
msix_vector_use(PCIDevice *dev, unsigned vector); void msix_vector_unuse(PCIDevice *dev, unsigned vector); void msix_unuse_all_vectors(PCIDevice *dev); +void msix_set_mask(PCIDevice *dev, int vector, bool mask, Error **errp); void msix_notify(PCIDevice *dev, unsigned vector); diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 44dacfa..b54b6ef 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -16,6 +16,7 @@ extern bool pci_available; #define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) #define PCI_FUNC(devfn) ((devfn) & 0x07) #define PCI_BUILD_BDF(bus, devfn) ((bus << 8) | (devfn)) +#define PCI_BDF_TO_DEVFN(x) ((x) & 0xff) #define PCI_BUS_MAX 256 #define PCI_DEVFN_MAX 256 #define PCI_SLOT_MAX 32 @@ -127,6 +128,10 @@ typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num, pcibus_t addr, pcibus_t size, int type); typedef void PCIUnregisterFunc(PCIDevice *pci_dev); +typedef void MSITriggerFunc(PCIDevice *dev, MSIMessage msg); +typedef MSIMessage MSIPrepareMessageFunc(PCIDevice *dev, unsigned vector); +typedef MSIMessage MSIxPrepareMessageFunc(PCIDevice *dev, unsigned vector); + typedef struct PCIIORegion { pcibus_t addr; /* current PCI mapping address. -1 means not mapped */ #define PCI_BAR_UNMAPPED (~(pcibus_t)0) @@ -329,6 +334,14 @@ struct PCIDevice { /* Space to store MSIX table & pending bit array */ uint8_t *msix_table; uint8_t *msix_pba; + + /* May be used by INTx or MSI during interrupt notification */ + void *irq_opaque; + + MSITriggerFunc *msi_trigger; + MSIPrepareMessageFunc *msi_prepare_message; + MSIxPrepareMessageFunc *msix_prepare_message; + /* MemoryRegion container for msix exclusive BAR setup */ MemoryRegion msix_exclusive_bar; /* Memory Regions for MSIX table and pending bit entries. 
*/ diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 92c3d65..98774e2 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -193,6 +193,7 @@ struct DeviceState { int instance_id_alias; int alias_required_for_version; ResettableState reset; + GSList *unplug_blockers; }; struct DeviceListener { @@ -420,6 +421,34 @@ void qdev_machine_creation_done(void); bool qdev_machine_modified(void); /** + * qdev_add_unplug_blocker: Add an unplug blocker to a device + * + * @dev: Device to be blocked from unplug + * @reason: Reason for blocking + */ +void qdev_add_unplug_blocker(DeviceState *dev, Error *reason); + +/** + * qdev_del_unplug_blocker: Remove an unplug blocker from a device + * + * @dev: Device to be unblocked + * @reason: Pointer to the Error used with qdev_add_unplug_blocker. + * Used as a handle to lookup the blocker for deletion. + */ +void qdev_del_unplug_blocker(DeviceState *dev, Error *reason); + +/** + * qdev_unplug_blocked: Confirm if a device is blocked from unplug + * + * @dev: Device to be tested + * @reason: Returns one of the reasons why the device is blocked, + * if any + * + * Returns: true if device is blocked from unplug, false otherwise + */ +bool qdev_unplug_blocked(DeviceState *dev, Error **errp); + +/** * GpioPolarity: Polarity of a GPIO line * * GPIO lines use either positive (active-high) logic, diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h new file mode 100644 index 0000000..33b68a8 --- /dev/null +++ b/include/hw/remote/iommu.h @@ -0,0 +1,40 @@ +/** + * Copyright © 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#ifndef REMOTE_IOMMU_H +#define REMOTE_IOMMU_H + +#include "hw/pci/pci_bus.h" +#include "hw/pci/pci.h" + +#ifndef INT2VOIDP +#define INT2VOIDP(i) (void *)(uintptr_t)(i) +#endif + +typedef struct RemoteIommuElem { + MemoryRegion *mr; + + AddressSpace as; +} RemoteIommuElem; + +#define TYPE_REMOTE_IOMMU "x-remote-iommu" +OBJECT_DECLARE_SIMPLE_TYPE(RemoteIommu, REMOTE_IOMMU) + +struct RemoteIommu { + Object parent; + + GHashTable *elem_by_devfn; + + QemuMutex lock; +}; + +void remote_iommu_setup(PCIBus *pci_bus); + +void remote_iommu_unplug_dev(PCIDevice *pci_dev); + +#endif diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h index 2a2a33c..ac32fda 100644 --- a/include/hw/remote/machine.h +++ b/include/hw/remote/machine.h @@ -22,6 +22,10 @@ struct RemoteMachineState { RemotePCIHost *host; RemoteIOHubState iohub; + + bool vfio_user; + + bool auto_shutdown; }; /* Used to pass to co-routine device and ioc. */ diff --git a/include/hw/remote/vfio-user-obj.h b/include/hw/remote/vfio-user-obj.h new file mode 100644 index 0000000..87ab78b --- /dev/null +++ b/include/hw/remote/vfio-user-obj.h @@ -0,0 +1,6 @@ +#ifndef VFIO_USER_OBJ_H +#define VFIO_USER_OBJ_H + +void vfu_object_set_bus_irq(PCIBus *pci_bus); + +#endif diff --git a/meson.build b/meson.build index 0c2e11f..ca19ddc 100644 --- a/meson.build +++ b/meson.build @@ -308,6 +308,10 @@ multiprocess_allowed = get_option('multiprocess') \ .require(targetos == 'linux', error_message: 'Multiprocess QEMU is supported only on Linux') \ .allowed() +vfio_user_server_allowed = get_option('vfio_user_server') \ + .require(targetos == 'linux', error_message: 'vfio-user server is supported only on Linux') \ + .allowed() + have_tpm = get_option('tpm') \ .require(targetos != 'windows', error_message: 'TPM emulation only available on POSIX systems') \ .allowed() @@ -1752,6 +1756,7 @@ config_host_data.set('CONFIG_LIBNFS', libnfs.found()) config_host_data.set('CONFIG_LIBSSH', libssh.found()) 
config_host_data.set('CONFIG_LINUX_AIO', libaio.found()) config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found()) +config_host_data.set('CONFIG_LIBURING_REGISTER_RING_FD', cc.has_function('io_uring_register_ring_fd', prefix: '#include <liburing.h>', dependencies:linux_io_uring)) config_host_data.set('CONFIG_LIBPMEM', libpmem.found()) config_host_data.set('CONFIG_NUMA', numa.found()) config_host_data.set('CONFIG_OPENGL', opengl.found()) @@ -2379,7 +2384,8 @@ host_kconfig = \ (have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \ ('CONFIG_LINUX' in config_host ? ['CONFIG_LINUX=y'] : []) + \ (have_pvrdma ? ['CONFIG_PVRDMA=y'] : []) + \ - (multiprocess_allowed ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : []) + (multiprocess_allowed ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : []) + \ + (vfio_user_server_allowed ? ['CONFIG_VFIO_USER_SERVER_ALLOWED=y'] : []) ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ] @@ -2671,6 +2677,21 @@ if have_system endif endif +libvfio_user_dep = not_found +if have_system and vfio_user_server_allowed + have_internal = fs.exists(meson.current_source_dir() / 'subprojects/libvfio-user/meson.build') + + if not have_internal + error('libvfio-user source not found - please pull git submodule') + endif + + libvfio_user_proj = subproject('libvfio-user') + + libvfio_user_lib = libvfio_user_proj.get_variable('libvfio_user_dep') + + libvfio_user_dep = declare_dependency(dependencies: [libvfio_user_lib]) +endif + fdt = not_found if have_system fdt_opt = get_option('fdt') @@ -3789,6 +3810,7 @@ summary_info += {'target list': ' '.join(target_dirs)} if have_system summary_info += {'default devices': get_option('default_devices')} summary_info += {'out of process emulation': multiprocess_allowed} + summary_info += {'vfio-user server': vfio_user_server_allowed} endif summary(summary_info, bool_yn: true, section: 'Targets and accelerators') diff --git a/meson_options.txt b/meson_options.txt index 0e81973..f3e2f22 100644 --- a/meson_options.txt +++ 
b/meson_options.txt @@ -88,6 +88,8 @@ option('cfi_debug', type: 'boolean', value: 'false', description: 'Verbose errors in case of CFI violation') option('multiprocess', type: 'feature', value: 'auto', description: 'Out of process device emulation support') +option('vfio_user_server', type: 'feature', value: 'disabled', + description: 'vfio-user server support') option('dbus_display', type: 'feature', value: 'auto', description: '-display dbus support') option('tpm', type : 'feature', value : 'auto', diff --git a/qapi/misc.json b/qapi/misc.json index 4534448..27ef5a2 100644 --- a/qapi/misc.json +++ b/qapi/misc.json @@ -553,3 +553,34 @@ ## { 'event': 'RTC_CHANGE', 'data': { 'offset': 'int', 'qom-path': 'str' } } + +## +# @VFU_CLIENT_HANGUP: +# +# Emitted when the client of a TYPE_VFIO_USER_SERVER closes the +# communication channel +# +# @vfu-id: ID of the TYPE_VFIO_USER_SERVER object. It is the last component +# of @vfu-qom-path referenced below +# +# @vfu-qom-path: path to the TYPE_VFIO_USER_SERVER object in the QOM tree +# +# @dev-id: ID of attached PCI device +# +# @dev-qom-path: path to attached PCI device in the QOM tree +# +# Since: 7.1 +# +# Example: +# +# <- { "event": "VFU_CLIENT_HANGUP", +# "data": { "vfu-id": "vfu1", +# "vfu-qom-path": "/objects/vfu1", +# "dev-id": "sas1", +# "dev-qom-path": "/machine/peripheral/sas1" }, +# "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } +# +## +{ 'event': 'VFU_CLIENT_HANGUP', + 'data': { 'vfu-id': 'str', 'vfu-qom-path': 'str', + 'dev-id': 'str', 'dev-qom-path': 'str' } } diff --git a/qapi/qom.json b/qapi/qom.json index 6a653c6..80dd419 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -735,6 +735,20 @@ 'data': { 'fd': 'str', 'devid': 'str' } } ## +# @VfioUserServerProperties: +# +# Properties for x-vfio-user-server objects. 
+# +# @socket: socket to be used by the libvfio-user library +# +# @device: the ID of the device to be emulated at the server +# +# Since: 7.1 +## +{ 'struct': 'VfioUserServerProperties', + 'data': { 'socket': 'SocketAddress', 'device': 'str' } } + +## # @RngProperties: # # Properties for objects of classes derived from rng. @@ -874,7 +888,8 @@ 'tls-creds-psk', 'tls-creds-x509', 'tls-cipher-suites', - { 'name': 'x-remote-object', 'features': [ 'unstable' ] } + { 'name': 'x-remote-object', 'features': [ 'unstable' ] }, + { 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] } ] } ## @@ -938,7 +953,8 @@ 'tls-creds-psk': 'TlsCredsPskProperties', 'tls-creds-x509': 'TlsCredsX509Properties', 'tls-cipher-suites': 'TlsCredsProperties', - 'x-remote-object': 'RemoteObjectProperties' + 'x-remote-object': 'RemoteObjectProperties', + 'x-vfio-user-server': 'VfioUserServerProperties' } } ## diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 1fc1d2e..24eb5f3 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -153,6 +153,8 @@ meson_options_help() { printf "%s\n" ' usb-redir libusbredir support' printf "%s\n" ' vde vde network backend support' printf "%s\n" ' vdi vdi image format support' + printf "%s\n" ' vfio-user-server' + printf "%s\n" ' vfio-user server support' printf "%s\n" ' vhost-crypto vhost-user crypto backend support' printf "%s\n" ' vhost-kernel vhost kernel backend support' printf "%s\n" ' vhost-net vhost-net kernel acceleration support' @@ -415,6 +417,8 @@ _meson_option_parse() { --disable-vde) printf "%s" -Dvde=disabled ;; --enable-vdi) printf "%s" -Dvdi=enabled ;; --disable-vdi) printf "%s" -Dvdi=disabled ;; + --enable-vfio-user-server) printf "%s" -Dvfio_user_server=enabled ;; + --disable-vfio-user-server) printf "%s" -Dvfio_user_server=disabled ;; --enable-vhost-crypto) printf "%s" -Dvhost_crypto=enabled ;; --disable-vhost-crypto) printf "%s" -Dvhost_crypto=disabled ;; --enable-vhost-kernel) printf 
"%s" -Dvhost_kernel=enabled ;; diff --git a/softmmu/physmem.c b/softmmu/physmem.c index 657841e..fb16be5 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -2719,7 +2719,7 @@ void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size) invalidate_and_set_dirty(mr, addr, size); } -static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) +int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) { unsigned access_size_max = mr->ops->valid.max_access_size; @@ -2746,7 +2746,7 @@ static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) return l; } -static bool prepare_mmio_access(MemoryRegion *mr) +bool prepare_mmio_access(MemoryRegion *mr) { bool release_lock = false; diff --git a/softmmu/qdev-monitor.c b/softmmu/qdev-monitor.c index bb5897f..4b0ef65 100644 --- a/softmmu/qdev-monitor.c +++ b/softmmu/qdev-monitor.c @@ -899,6 +899,10 @@ void qdev_unplug(DeviceState *dev, Error **errp) HotplugHandlerClass *hdc; Error *local_err = NULL; + if (qdev_unplug_blocked(dev, errp)) { + return; + } + if (dev->parent_bus && !qbus_is_hotpluggable(dev->parent_bus)) { error_setg(errp, QERR_BUS_NO_HOTPLUG, dev->parent_bus->name); return; diff --git a/stubs/meson.build b/stubs/meson.build index 6f80fec..d8f3fd5 100644 --- a/stubs/meson.build +++ b/stubs/meson.build @@ -60,3 +60,4 @@ if have_system else stub_ss.add(files('qdev.c')) endif +stub_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_false: files('vfio-user-obj.c')) diff --git a/stubs/vfio-user-obj.c b/stubs/vfio-user-obj.c new file mode 100644 index 0000000..79100d7 --- /dev/null +++ b/stubs/vfio-user-obj.c @@ -0,0 +1,6 @@ +#include "qemu/osdep.h" +#include "hw/remote/vfio-user-obj.h" + +void vfu_object_set_bus_irq(PCIBus *pci_bus) +{ +} diff --git a/subprojects/libvfio-user b/subprojects/libvfio-user new file mode 160000 +Subproject 0b28d205572c80b568a1003db2c8f37ca333e4d diff --git a/tests/docker/dockerfiles/centos8.docker b/tests/docker/dockerfiles/centos8.docker 
index 4b20925..10618bf 100644 --- a/tests/docker/dockerfiles/centos8.docker +++ b/tests/docker/dockerfiles/centos8.docker @@ -51,6 +51,7 @@ RUN dnf update -y && \ libbpf-devel \ libcacard-devel \ libcap-ng-devel \ + libcmocka-devel \ libcurl-devel \ libdrm-devel \ libepoxy-devel \ @@ -59,6 +60,7 @@ RUN dnf update -y && \ libgcrypt-devel \ libiscsi-devel \ libjpeg-devel \ + json-c-devel \ libnfs-devel \ libpmem-devel \ libpng-devel \ diff --git a/tests/qtest/fuzz/generic_fuzz.c b/tests/qtest/fuzz/generic_fuzz.c index 25df19f..447ffe8 100644 --- a/tests/qtest/fuzz/generic_fuzz.c +++ b/tests/qtest/fuzz/generic_fuzz.c @@ -144,7 +144,7 @@ static void *pattern_alloc(pattern p, size_t len) return buf; } -static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) +static int fuzz_memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) { unsigned access_size_max = mr->ops->valid.max_access_size; @@ -242,11 +242,12 @@ void fuzz_dma_read_cb(size_t addr, size_t len, MemoryRegion *mr) /* * If mr1 isn't RAM, address_space_translate doesn't update l. Use - * memory_access_size to identify the number of bytes that it is safe - * to write without accidentally writing to another MemoryRegion. + * fuzz_memory_access_size to identify the number of bytes that it + * is safe to write without accidentally writing to another + * MemoryRegion. */ if (!memory_region_is_ram(mr1)) { - l = memory_access_size(mr1, l, addr1); + l = fuzz_memory_access_size(mr1, l, addr1); } if (memory_region_is_ram(mr1) || memory_region_is_romd(mr1) || |