aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Henderson <richard.henderson@linaro.org>2022-06-15 09:47:24 -0700
committerRichard Henderson <richard.henderson@linaro.org>2022-06-15 09:47:24 -0700
commit9ac873a46963098441be920ef7a2eaf244a3352d (patch)
tree86bb7301ad0f534513a53984d0d3c17a1740f453
parent8e6c70b9d4a1b1f3011805947925cfdb31642f7f (diff)
parent99b969fbe105117f5af6060d3afef40ca39cc9c1 (diff)
downloadqemu-9ac873a46963098441be920ef7a2eaf244a3352d.zip
qemu-9ac873a46963098441be920ef7a2eaf244a3352d.tar.gz
qemu-9ac873a46963098441be920ef7a2eaf244a3352d.tar.bz2
Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into staging
Pull request This pull request includes an important aio=native I/O stall fix, the experimental vifo-user server, the io_uring_register_ring_fd() optimization for aio=io_uring, and an update to Vladimir Sementsov-Ogievskiy's maintainership details. # -----BEGIN PGP SIGNATURE----- # # iQEzBAABCAAdFiEEhpWov9P5fNqsNXdanKSrs4Grc8gFAmKp/+AACgkQnKSrs4Gr # c8gg9wf/ZG1+eGR2NA0T1szlhtgy2bnp95hrLbKzP7tVxueFq7QCcsIsLGWqfnMd # RREUi6Tgx1v7Agk2oIyUcrjn5rt4LPVOKolVbK6e5Pyou2/Sf/ApkhRjRnzzfACE # J56H8gPU7fS4/8sJYCYGlWEr7pMmJMVJFPl2tNsErPwuZMSjo27n6UqDE/ZSZF1p # w1a+cwo+6YSjtJg4AFB/+izBam4+U6w1YhgZM6p6hx5a7GLoq/w59W6Yb119GANO # tg5qzmSHtMKTieORJmYAt83T1xS5d/iyca4w1PiYQxJsHsqwAaPpoyEhgGT+u+CA # hfb3HDdQCFyVKwlKD5H1a+WD/Hr11w== # =zcl8 # -----END PGP SIGNATURE----- # gpg: Signature made Wed 15 Jun 2022 08:50:56 AM PDT # gpg: using RSA key 8695A8BFD3F97CDAAC35775A9CA4ABB381AB73C8 # gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>" [full] # gpg: aka "Stefan Hajnoczi <stefanha@gmail.com>" [full] * tag 'block-pull-request' of https://gitlab.com/stefanha/qemu: linux-aio: explain why max batch is checked in laio_io_unplug() linux-aio: fix unbalanced plugged counter in laio_io_unplug() vfio-user: handle reset of remote device vfio-user: handle device interrupts vfio-user: handle PCI BAR accesses vfio-user: handle DMA mappings vfio-user: IOMMU support for remote device vfio-user: handle PCI config space accesses vfio-user: run vfio-user context vfio-user: find and init PCI device vfio-user: instantiate vfio-user context vfio-user: define vfio-user-server object vfio-user: build library remote/machine: add vfio-user property remote/machine: add HotplugHandler for remote machine qdev: unplug blocker for devices Use io_uring_register_ring_fd() to skip fd operations MAINTAINERS: update Vladimir's address and repositories Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-rw-r--r--.gitlab-ci.d/buildtest.yml1
-rw-r--r--.gitmodules3
-rw-r--r--Kconfig.host4
-rw-r--r--MAINTAINERS27
-rw-r--r--block/io_uring.c12
-rw-r--r--block/linux-aio.c10
-rwxr-xr-xconfigure17
-rw-r--r--hw/core/qdev.c24
-rw-r--r--hw/pci/msi.c49
-rw-r--r--hw/pci/msix.c35
-rw-r--r--hw/pci/pci.c13
-rw-r--r--hw/remote/Kconfig4
-rw-r--r--hw/remote/iommu.c131
-rw-r--r--hw/remote/machine.c88
-rw-r--r--hw/remote/meson.build4
-rw-r--r--hw/remote/trace-events11
-rw-r--r--hw/remote/vfio-user-obj.c958
-rw-r--r--include/exec/memory.h3
-rw-r--r--include/hw/pci/msi.h1
-rw-r--r--include/hw/pci/msix.h1
-rw-r--r--include/hw/pci/pci.h13
-rw-r--r--include/hw/qdev-core.h29
-rw-r--r--include/hw/remote/iommu.h40
-rw-r--r--include/hw/remote/machine.h4
-rw-r--r--include/hw/remote/vfio-user-obj.h6
-rw-r--r--meson.build24
-rw-r--r--meson_options.txt2
-rw-r--r--qapi/misc.json31
-rw-r--r--qapi/qom.json20
-rw-r--r--scripts/meson-buildoptions.sh4
-rw-r--r--softmmu/physmem.c4
-rw-r--r--softmmu/qdev-monitor.c4
-rw-r--r--stubs/meson.build1
-rw-r--r--stubs/vfio-user-obj.c6
m---------subprojects/libvfio-user0
-rw-r--r--tests/docker/dockerfiles/centos8.docker2
-rw-r--r--tests/qtest/fuzz/generic_fuzz.c9
37 files changed, 1564 insertions, 31 deletions
diff --git a/.gitlab-ci.d/buildtest.yml b/.gitlab-ci.d/buildtest.yml
index cb7cad4..8a4353e 100644
--- a/.gitlab-ci.d/buildtest.yml
+++ b/.gitlab-ci.d/buildtest.yml
@@ -168,6 +168,7 @@ build-system-centos:
IMAGE: centos8
CONFIGURE_ARGS: --disable-nettle --enable-gcrypt --enable-fdt=system
--enable-modules --enable-trace-backends=dtrace --enable-docs
+ --enable-vfio-user-server
TARGETS: ppc64-softmmu or1k-softmmu s390x-softmmu
x86_64-softmmu rx-softmmu sh4-softmmu nios2-softmmu
MAKE_CHECK_ARGS: check-build
diff --git a/.gitmodules b/.gitmodules
index b8bff47..aedd9a0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -64,3 +64,6 @@
[submodule "tests/lcitool/libvirt-ci"]
path = tests/lcitool/libvirt-ci
url = https://gitlab.com/libvirt/libvirt-ci.git
+[submodule "subprojects/libvfio-user"]
+ path = subprojects/libvfio-user
+ url = https://gitlab.com/qemu-project/libvfio-user.git
diff --git a/Kconfig.host b/Kconfig.host
index 1165c4e..d763d89 100644
--- a/Kconfig.host
+++ b/Kconfig.host
@@ -42,3 +42,7 @@ config MULTIPROCESS_ALLOWED
config FUZZ
bool
select SPARSE_MEM
+
+config VFIO_USER_SERVER_ALLOWED
+ bool
+ imply VFIO_USER_SERVER
diff --git a/MAINTAINERS b/MAINTAINERS
index 4cf6174..aaa649a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2546,7 +2546,7 @@ F: scsi/*
Block Jobs
M: John Snow <jsnow@redhat.com>
-M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
+M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
L: qemu-block@nongnu.org
S: Supported
F: blockjob.c
@@ -2571,7 +2571,7 @@ F: block/aio_task.c
F: util/qemu-co-shared-resource.c
F: include/qemu/co-shared-resource.h
T: git https://gitlab.com/jsnow/qemu.git jobs
-T: git https://src.openvz.org/scm/~vsementsov/qemu.git jobs
+T: git https://gitlab.com/vsementsov/qemu.git block
Block QAPI, monitor, command line
M: Markus Armbruster <armbru@redhat.com>
@@ -2592,7 +2592,7 @@ F: include/hw/cxl/
Dirty Bitmaps
M: Eric Blake <eblake@redhat.com>
-M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
+M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
R: John Snow <jsnow@redhat.com>
L: qemu-block@nongnu.org
S: Supported
@@ -2606,6 +2606,7 @@ F: util/hbitmap.c
F: tests/unit/test-hbitmap.c
F: docs/interop/bitmaps.rst
T: git https://repo.or.cz/qemu/ericb.git bitmaps
+T: git https://gitlab.com/vsementsov/qemu.git block
Character device backends
M: Marc-André Lureau <marcandre.lureau@redhat.com>
@@ -2816,16 +2817,17 @@ F: scripts/*.py
F: tests/*.py
Benchmark util
-M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
+M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
S: Maintained
F: scripts/simplebench/
-T: git https://src.openvz.org/scm/~vsementsov/qemu.git simplebench
+T: git https://gitlab.com/vsementsov/qemu.git simplebench
Transactions helper
-M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
+M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
S: Maintained
F: include/qemu/transactions.h
F: util/transactions.c
+T: git https://gitlab.com/vsementsov/qemu.git block
QAPI
M: Markus Armbruster <armbru@redhat.com>
@@ -3402,7 +3404,7 @@ F: block/iscsi-opts.c
Network Block Device (NBD)
M: Eric Blake <eblake@redhat.com>
-M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
+M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
L: qemu-block@nongnu.org
S: Maintained
F: block/nbd*
@@ -3414,7 +3416,7 @@ F: docs/interop/nbd.txt
F: docs/tools/qemu-nbd.rst
F: tests/qemu-iotests/tests/*nbd*
T: git https://repo.or.cz/qemu/ericb.git nbd
-T: git https://src.openvz.org/scm/~vsementsov/qemu.git nbd
+T: git https://gitlab.com/vsementsov/qemu.git block
NFS
M: Peter Lieven <pl@kamp.de>
@@ -3499,13 +3501,13 @@ F: block/dmg.c
parallels
M: Stefan Hajnoczi <stefanha@redhat.com>
M: Denis V. Lunev <den@openvz.org>
-M: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
+M: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
L: qemu-block@nongnu.org
S: Supported
F: block/parallels.c
F: block/parallels-ext.c
F: docs/interop/parallels.txt
-T: git https://src.openvz.org/scm/~vsementsov/qemu.git parallels
+T: git https://gitlab.com/vsementsov/qemu.git block
qed
M: Stefan Hajnoczi <stefanha@redhat.com>
@@ -3640,6 +3642,11 @@ F: hw/remote/proxy-memory-listener.c
F: include/hw/remote/proxy-memory-listener.h
F: hw/remote/iohub.c
F: include/hw/remote/iohub.h
+F: subprojects/libvfio-user
+F: hw/remote/vfio-user-obj.c
+F: include/hw/remote/vfio-user-obj.h
+F: hw/remote/iommu.c
+F: include/hw/remote/iommu.h
EBPF:
M: Jason Wang <jasowang@redhat.com>
diff --git a/block/io_uring.c b/block/io_uring.c
index 0b40151..d48e472 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -18,6 +18,7 @@
#include "qapi/error.h"
#include "trace.h"
+
/* io_uring ring size */
#define MAX_ENTRIES 128
@@ -434,8 +435,17 @@ LuringState *luring_init(Error **errp)
}
ioq_init(&s->io_q);
- return s;
+#ifdef CONFIG_LIBURING_REGISTER_RING_FD
+ if (io_uring_register_ring_fd(&s->ring) < 0) {
+ /*
+ * Only warn about this error: we will fallback to the non-optimized
+ * io_uring operations.
+ */
+ warn_report("failed to register linux io_uring ring file descriptor");
+ }
+#endif
+ return s;
}
void luring_cleanup(LuringState *s)
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 4c423fc..9c2393a 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -363,8 +363,16 @@ void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s,
uint64_t dev_max_batch)
{
assert(s->io_q.plugged);
+ s->io_q.plugged--;
+
+ /*
+ * Why max batch checking is performed here:
+ * Another BDS may have queued requests with a higher dev_max_batch and
+ * therefore in_queue could now exceed our dev_max_batch. Re-check the max
+ * batch so we can honor our device's dev_max_batch.
+ */
if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
- (--s->io_q.plugged == 0 &&
+ (!s->io_q.plugged &&
!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
ioq_submit(s);
}
diff --git a/configure b/configure
index 4b12a80..c14e7f5 100755
--- a/configure
+++ b/configure
@@ -315,6 +315,7 @@ meson_args=""
ninja=""
bindir="bin"
skip_meson=no
+vfio_user_server="disabled"
# The following Meson options are handled manually (still they
# are included in the automatically generated help message)
@@ -909,6 +910,10 @@ for opt do
;;
--disable-blobs) meson_option_parse --disable-install-blobs ""
;;
+ --enable-vfio-user-server) vfio_user_server="enabled"
+ ;;
+ --disable-vfio-user-server) vfio_user_server="disabled"
+ ;;
--enable-tcmalloc) meson_option_parse --enable-malloc=tcmalloc tcmalloc
;;
--enable-jemalloc) meson_option_parse --enable-malloc=jemalloc jemalloc
@@ -2133,6 +2138,17 @@ write_container_target_makefile() {
##########################################
+# check for vfio_user_server
+
+case "$vfio_user_server" in
+ enabled )
+ if test "$git_submodules_action" != "ignore"; then
+ git_submodules="${git_submodules} subprojects/libvfio-user"
+ fi
+ ;;
+esac
+
+##########################################
# End of CC checks
# After here, no more $cc or $ld runs
@@ -2672,6 +2688,7 @@ if test "$skip_meson" = no; then
test "$slirp" != auto && meson_option_add "-Dslirp=$slirp"
test "$smbd" != '' && meson_option_add "-Dsmbd=$smbd"
test "$tcg" != enabled && meson_option_add "-Dtcg=$tcg"
+ test "$vfio_user_server" != auto && meson_option_add "-Dvfio_user_server=$vfio_user_server"
run_meson() {
NINJA=$ninja $meson setup --prefix "$prefix" "$@" $cross_arg "$PWD" "$source_path"
}
diff --git a/hw/core/qdev.c b/hw/core/qdev.c
index 84f3019..0806d8f 100644
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -468,6 +468,28 @@ char *qdev_get_dev_path(DeviceState *dev)
return NULL;
}
+void qdev_add_unplug_blocker(DeviceState *dev, Error *reason)
+{
+ dev->unplug_blockers = g_slist_prepend(dev->unplug_blockers, reason);
+}
+
+void qdev_del_unplug_blocker(DeviceState *dev, Error *reason)
+{
+ dev->unplug_blockers = g_slist_remove(dev->unplug_blockers, reason);
+}
+
+bool qdev_unplug_blocked(DeviceState *dev, Error **errp)
+{
+ ERRP_GUARD();
+
+ if (dev->unplug_blockers) {
+ error_propagate(errp, error_copy(dev->unplug_blockers->data));
+ return true;
+ }
+
+ return false;
+}
+
static bool device_get_realized(Object *obj, Error **errp)
{
DeviceState *dev = DEVICE(obj);
@@ -704,6 +726,8 @@ static void device_finalize(Object *obj)
DeviceState *dev = DEVICE(obj);
+ g_assert(!dev->unplug_blockers);
+
QLIST_FOREACH_SAFE(ngl, &dev->gpios, node, next) {
QLIST_REMOVE(ngl, node);
qemu_free_irqs(ngl->in, ngl->num_in);
diff --git a/hw/pci/msi.c b/hw/pci/msi.c
index 47d2b0f..5c471b9 100644
--- a/hw/pci/msi.c
+++ b/hw/pci/msi.c
@@ -134,7 +134,7 @@ void msi_set_message(PCIDevice *dev, MSIMessage msg)
pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
}
-MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector)
+static MSIMessage msi_prepare_message(PCIDevice *dev, unsigned int vector)
{
uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
@@ -159,6 +159,11 @@ MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector)
return msg;
}
+MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector)
+{
+ return dev->msi_prepare_message(dev, vector);
+}
+
bool msi_enabled(const PCIDevice *dev)
{
return msi_present(dev) &&
@@ -241,6 +246,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
0xffffffff >> (PCI_MSI_VECTORS_MAX - nr_vectors));
}
+ dev->msi_prepare_message = msi_prepare_message;
+
return 0;
}
@@ -256,6 +263,7 @@ void msi_uninit(struct PCIDevice *dev)
cap_size = msi_cap_sizeof(flags);
pci_del_capability(dev, PCI_CAP_ID_MSI, cap_size);
dev->cap_present &= ~QEMU_PCI_CAP_MSI;
+ dev->msi_prepare_message = NULL;
MSI_DEV_PRINTF(dev, "uninit\n");
}
@@ -307,6 +315,39 @@ bool msi_is_masked(const PCIDevice *dev, unsigned int vector)
return mask & (1U << vector);
}
+void msi_set_mask(PCIDevice *dev, int vector, bool mask, Error **errp)
+{
+ ERRP_GUARD();
+ uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
+ bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
+ uint32_t irq_state, vector_mask, pending;
+
+ if (vector > PCI_MSI_VECTORS_MAX) {
+ error_setg(errp, "msi: vector %d not allocated. max vector is %d",
+ vector, PCI_MSI_VECTORS_MAX);
+ return;
+ }
+
+ vector_mask = (1U << vector);
+
+ irq_state = pci_get_long(dev->config + msi_mask_off(dev, msi64bit));
+
+ if (mask) {
+ irq_state |= vector_mask;
+ } else {
+ irq_state &= ~vector_mask;
+ }
+
+ pci_set_long(dev->config + msi_mask_off(dev, msi64bit), irq_state);
+
+ pending = pci_get_long(dev->config + msi_pending_off(dev, msi64bit));
+ if (!mask && (pending & vector_mask)) {
+ pending &= ~vector_mask;
+ pci_set_long(dev->config + msi_pending_off(dev, msi64bit), pending);
+ msi_notify(dev, vector);
+ }
+}
+
void msi_notify(PCIDevice *dev, unsigned int vector)
{
uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
@@ -334,11 +375,7 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
void msi_send_message(PCIDevice *dev, MSIMessage msg)
{
- MemTxAttrs attrs = {};
-
- attrs.requester_id = pci_requester_id(dev);
- address_space_stl_le(&dev->bus_master_as, msg.address, msg.data,
- attrs, NULL);
+ dev->msi_trigger(dev, msg);
}
/* Normally called by pci_default_write_config(). */
diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index ae9331c..1e381a9 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -31,7 +31,7 @@
#define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
#define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)
-MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
+static MSIMessage msix_prepare_message(PCIDevice *dev, unsigned vector)
{
uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
MSIMessage msg;
@@ -41,6 +41,11 @@ MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
return msg;
}
+MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
+{
+ return dev->msix_prepare_message(dev, vector);
+}
+
/*
* Special API for POWER to configure the vectors through
* a side channel. Should never be used by devices.
@@ -131,6 +136,31 @@ static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked)
}
}
+void msix_set_mask(PCIDevice *dev, int vector, bool mask, Error **errp)
+{
+ ERRP_GUARD();
+ unsigned offset;
+ bool was_masked;
+
+ if (vector > dev->msix_entries_nr) {
+ error_setg(errp, "msix: vector %d not allocated. max vector is %d",
+ vector, dev->msix_entries_nr);
+ return;
+ }
+
+ offset = vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
+
+ was_masked = msix_is_masked(dev, vector);
+
+ if (mask) {
+ dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
+ } else {
+ dev->msix_table[offset] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
+ }
+
+ msix_handle_mask_update(dev, vector, was_masked);
+}
+
static bool msix_masked(PCIDevice *dev)
{
return dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK;
@@ -344,6 +374,8 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
"msix-pba", pba_size);
memory_region_add_subregion(pba_bar, pba_offset, &dev->msix_pba_mmio);
+ dev->msix_prepare_message = msix_prepare_message;
+
return 0;
}
@@ -429,6 +461,7 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
g_free(dev->msix_entry_used);
dev->msix_entry_used = NULL;
dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
+ dev->msix_prepare_message = NULL;
}
void msix_uninit_exclusive_bar(PCIDevice *dev)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 6e70153..2f450f6 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -317,6 +317,15 @@ void pci_device_deassert_intx(PCIDevice *dev)
}
}
+static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg)
+{
+ MemTxAttrs attrs = {};
+
+ attrs.requester_id = pci_requester_id(dev);
+ address_space_stl_le(&dev->bus_master_as, msg.address, msg.data,
+ attrs, NULL);
+}
+
static void pci_reset_regions(PCIDevice *dev)
{
int r;
@@ -1212,6 +1221,8 @@ static void pci_qdev_unrealize(DeviceState *dev)
pci_device_deassert_intx(pci_dev);
do_pci_unregister_device(pci_dev);
+
+ pci_dev->msi_trigger = NULL;
}
void pci_register_bar(PCIDevice *pci_dev, int region_num,
@@ -2251,6 +2262,8 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
}
pci_set_power(pci_dev, true);
+
+ pci_dev->msi_trigger = pci_msi_trigger;
}
PCIDevice *pci_new_multifunction(int devfn, bool multifunction,
diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig
index 08c16e2..2d6b4f4 100644
--- a/hw/remote/Kconfig
+++ b/hw/remote/Kconfig
@@ -2,3 +2,7 @@ config MULTIPROCESS
bool
depends on PCI && PCI_EXPRESS && KVM
select REMOTE_PCIHOST
+
+config VFIO_USER_SERVER
+ bool
+ depends on MULTIPROCESS
diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
new file mode 100644
index 0000000..fd723d9
--- /dev/null
+++ b/hw/remote/iommu.c
@@ -0,0 +1,131 @@
+/**
+ * IOMMU for remote device
+ *
+ * Copyright © 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/remote/iommu.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
+#include "trace.h"
+
+/**
+ * IOMMU for TYPE_REMOTE_MACHINE - manages DMA address space isolation
+ * for remote machine. It is used by TYPE_VFIO_USER_SERVER.
+ *
+ * - Each TYPE_VFIO_USER_SERVER instance handles one PCIDevice on a PCIBus.
+ * There is one RemoteIommu per PCIBus, so the RemoteIommu tracks multiple
+ * PCIDevices by maintaining a ->elem_by_devfn mapping.
+ *
+ * - memory_region_init_iommu() is not used because vfio-user MemoryRegions
+ * will be added to the elem->mr container instead. This is more natural
+ * than implementing the IOMMUMemoryRegionClass APIs since vfio-user
+ * provides something that is close to a full-fledged MemoryRegion and
+ * not like an IOMMU mapping.
+ *
+ * - When a device is hot unplugged, the elem->mr reference is dropped so
+ * all vfio-user MemoryRegions associated with this vfio-user server are
+ * destroyed.
+ */
+
+static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
+ void *opaque, int devfn)
+{
+ RemoteIommu *iommu = opaque;
+ RemoteIommuElem *elem = NULL;
+
+ qemu_mutex_lock(&iommu->lock);
+
+ elem = g_hash_table_lookup(iommu->elem_by_devfn, INT2VOIDP(devfn));
+
+ if (!elem) {
+ elem = g_malloc0(sizeof(RemoteIommuElem));
+ g_hash_table_insert(iommu->elem_by_devfn, INT2VOIDP(devfn), elem);
+ }
+
+ if (!elem->mr) {
+ elem->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION));
+ memory_region_set_size(elem->mr, UINT64_MAX);
+ address_space_init(&elem->as, elem->mr, NULL);
+ }
+
+ qemu_mutex_unlock(&iommu->lock);
+
+ return &elem->as;
+}
+
+void remote_iommu_unplug_dev(PCIDevice *pci_dev)
+{
+ AddressSpace *as = pci_device_iommu_address_space(pci_dev);
+ RemoteIommuElem *elem = NULL;
+
+ if (as == &address_space_memory) {
+ return;
+ }
+
+ elem = container_of(as, RemoteIommuElem, as);
+
+ address_space_destroy(&elem->as);
+
+ object_unref(elem->mr);
+
+ elem->mr = NULL;
+}
+
+static void remote_iommu_init(Object *obj)
+{
+ RemoteIommu *iommu = REMOTE_IOMMU(obj);
+
+ iommu->elem_by_devfn = g_hash_table_new_full(NULL, NULL, NULL, g_free);
+
+ qemu_mutex_init(&iommu->lock);
+}
+
+static void remote_iommu_finalize(Object *obj)
+{
+ RemoteIommu *iommu = REMOTE_IOMMU(obj);
+
+ qemu_mutex_destroy(&iommu->lock);
+
+ g_hash_table_destroy(iommu->elem_by_devfn);
+
+ iommu->elem_by_devfn = NULL;
+}
+
+void remote_iommu_setup(PCIBus *pci_bus)
+{
+ RemoteIommu *iommu = NULL;
+
+ g_assert(pci_bus);
+
+ iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU));
+
+ pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu);
+
+ object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu));
+
+ object_unref(OBJECT(iommu));
+}
+
+static const TypeInfo remote_iommu_info = {
+ .name = TYPE_REMOTE_IOMMU,
+ .parent = TYPE_OBJECT,
+ .instance_size = sizeof(RemoteIommu),
+ .instance_init = remote_iommu_init,
+ .instance_finalize = remote_iommu_finalize,
+};
+
+static void remote_iommu_register_types(void)
+{
+ type_register_static(&remote_iommu_info);
+}
+
+type_init(remote_iommu_register_types)
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
index 92d71d4..75d550d 100644
--- a/hw/remote/machine.c
+++ b/hw/remote/machine.c
@@ -20,6 +20,11 @@
#include "qapi/error.h"
#include "hw/pci/pci_host.h"
#include "hw/remote/iohub.h"
+#include "hw/remote/iommu.h"
+#include "hw/qdev-core.h"
+#include "hw/remote/iommu.h"
+#include "hw/remote/vfio-user-obj.h"
+#include "hw/pci/msi.h"
static void remote_machine_init(MachineState *machine)
{
@@ -49,25 +54,102 @@ static void remote_machine_init(MachineState *machine)
pci_host = PCI_HOST_BRIDGE(rem_host);
- remote_iohub_init(&s->iohub);
+ if (s->vfio_user) {
+ remote_iommu_setup(pci_host->bus);
- pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
- &s->iohub, REMOTE_IOHUB_NB_PIRQS);
+ msi_nonbroken = true;
+
+ vfu_object_set_bus_irq(pci_host->bus);
+ } else {
+ remote_iohub_init(&s->iohub);
+
+ pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
+ &s->iohub, REMOTE_IOHUB_NB_PIRQS);
+ }
+
+ qbus_set_hotplug_handler(BUS(pci_host->bus), OBJECT(s));
+}
+
+static bool remote_machine_get_vfio_user(Object *obj, Error **errp)
+{
+ RemoteMachineState *s = REMOTE_MACHINE(obj);
+
+ return s->vfio_user;
+}
+
+static void remote_machine_set_vfio_user(Object *obj, bool value, Error **errp)
+{
+ RemoteMachineState *s = REMOTE_MACHINE(obj);
+
+ if (phase_check(PHASE_MACHINE_CREATED)) {
+ error_setg(errp, "Error enabling vfio-user - machine already created");
+ return;
+ }
+
+ s->vfio_user = value;
+}
+
+static bool remote_machine_get_auto_shutdown(Object *obj, Error **errp)
+{
+ RemoteMachineState *s = REMOTE_MACHINE(obj);
+
+ return s->auto_shutdown;
+}
+
+static void remote_machine_set_auto_shutdown(Object *obj, bool value,
+ Error **errp)
+{
+ RemoteMachineState *s = REMOTE_MACHINE(obj);
+
+ s->auto_shutdown = value;
+}
+
+static void remote_machine_instance_init(Object *obj)
+{
+ RemoteMachineState *s = REMOTE_MACHINE(obj);
+
+ s->auto_shutdown = true;
+}
+
+static void remote_machine_dev_unplug_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ qdev_unrealize(dev);
+
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+ remote_iommu_unplug_dev(PCI_DEVICE(dev));
+ }
}
static void remote_machine_class_init(ObjectClass *oc, void *data)
{
MachineClass *mc = MACHINE_CLASS(oc);
+ HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
mc->init = remote_machine_init;
mc->desc = "Experimental remote machine";
+
+ hc->unplug = remote_machine_dev_unplug_cb;
+
+ object_class_property_add_bool(oc, "vfio-user",
+ remote_machine_get_vfio_user,
+ remote_machine_set_vfio_user);
+
+ object_class_property_add_bool(oc, "auto-shutdown",
+ remote_machine_get_auto_shutdown,
+ remote_machine_set_auto_shutdown);
}
static const TypeInfo remote_machine = {
.name = TYPE_REMOTE_MACHINE,
.parent = TYPE_MACHINE,
.instance_size = sizeof(RemoteMachineState),
+ .instance_init = remote_machine_instance_init,
.class_init = remote_machine_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_HOTPLUG_HANDLER },
+ { }
+ }
};
static void remote_machine_register_types(void)
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
index e6a5574..ab25c04 100644
--- a/hw/remote/meson.build
+++ b/hw/remote/meson.build
@@ -6,6 +6,10 @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iommu.c'))
+remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: files('vfio-user-obj.c'))
+
+remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: libvfio_user_dep)
specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c'))
diff --git a/hw/remote/trace-events b/hw/remote/trace-events
index 0b23974..c167b3c 100644
--- a/hw/remote/trace-events
+++ b/hw/remote/trace-events
@@ -2,3 +2,14 @@
mpqemu_send_io_error(int cmd, int size, int nfds) "send command %d size %d, %d file descriptors to remote process"
mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive %d size %d, %d file descriptors to remote process"
+
+# vfio-user-obj.c
+vfu_prop(const char *prop, const char *val) "vfu: setting %s as %s"
+vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%u -> 0x%x"
+vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%u <- 0x%x"
+vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes"
+vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
+vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
+vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
+vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
+vfu_interrupt(int pirq) "vfu: sending interrupt to device - PIRQ %d"
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
new file mode 100644
index 0000000..c6cc53a
--- /dev/null
+++ b/hw/remote/vfio-user-obj.c
@@ -0,0 +1,958 @@
+/**
+ * QEMU vfio-user-server server object
+ *
+ * Copyright © 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
+ *
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+/**
+ * Usage: add options:
+ * -machine x-remote,vfio-user=on,auto-shutdown=on
+ * -device <PCI-device>,id=<pci-dev-id>
+ * -object x-vfio-user-server,id=<id>,type=unix,path=<socket-path>,
+ * device=<pci-dev-id>
+ *
+ * Note that x-vfio-user-server object must be used with x-remote machine only.
+ * This server could only support PCI devices for now.
+ *
+ * type - SocketAddress type - presently "unix" alone is supported. Required
+ * option
+ *
+ * path - named unix socket, it will be created by the server. It is
+ * a required option
+ *
+ * device - id of a device on the server, a required option. PCI devices
+ * alone are supported presently.
+ *
+ * notes - x-vfio-user-server could block IO and monitor during the
+ * initialization phase.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qom/object.h"
+#include "qom/object_interfaces.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "sysemu/runstate.h"
+#include "hw/boards.h"
+#include "hw/remote/machine.h"
+#include "qapi/error.h"
+#include "qapi/qapi-visit-sockets.h"
+#include "qapi/qapi-events-misc.h"
+#include "qemu/notify.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+#include "sysemu/sysemu.h"
+#include "libvfio-user.h"
+#include "hw/qdev-core.h"
+#include "hw/pci/pci.h"
+#include "qemu/timer.h"
+#include "exec/memory.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "hw/remote/vfio-user-obj.h"
+
+#define TYPE_VFU_OBJECT "x-vfio-user-server"
+OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
+
+/**
+ * VFU_OBJECT_ERROR - reports an error message. If auto_shutdown
+ * is set, it aborts the machine on error. Otherwise, it logs an
+ * error message without aborting.
+ */
+#define VFU_OBJECT_ERROR(o, fmt, ...) \
+ { \
+ if (vfu_object_auto_shutdown()) { \
+ error_setg(&error_abort, (fmt), ## __VA_ARGS__); \
+ } else { \
+ error_report((fmt), ## __VA_ARGS__); \
+ } \
+ } \
+
+struct VfuObjectClass {
+ ObjectClass parent_class;
+
+ unsigned int nr_devs;
+};
+
+struct VfuObject {
+ /* private */
+ Object parent;
+
+ SocketAddress *socket;
+
+ char *device;
+
+ Error *err;
+
+ Notifier machine_done;
+
+ vfu_ctx_t *vfu_ctx;
+
+ PCIDevice *pci_dev;
+
+ Error *unplug_blocker;
+
+ int vfu_poll_fd;
+
+ MSITriggerFunc *default_msi_trigger;
+ MSIPrepareMessageFunc *default_msi_prepare_message;
+ MSIxPrepareMessageFunc *default_msix_prepare_message;
+};
+
+static void vfu_object_init_ctx(VfuObject *o, Error **errp);
+
+static bool vfu_object_auto_shutdown(void)
+{
+ bool auto_shutdown = true;
+ Error *local_err = NULL;
+
+ if (!current_machine) {
+ return auto_shutdown;
+ }
+
+ auto_shutdown = object_property_get_bool(OBJECT(current_machine),
+ "auto-shutdown",
+ &local_err);
+
+ /*
+ * local_err would be set if no such property exists - safe to ignore.
+ * Unlikely scenario as auto-shutdown is always defined for
+ * TYPE_REMOTE_MACHINE, and TYPE_VFU_OBJECT only works with
+ * TYPE_REMOTE_MACHINE
+ */
+ if (local_err) {
+ auto_shutdown = true;
+ error_free(local_err);
+ }
+
+ return auto_shutdown;
+}
+
+static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ VfuObject *o = VFU_OBJECT(obj);
+
+ if (o->vfu_ctx) {
+ error_setg(errp, "vfu: Unable to set socket property - server busy");
+ return;
+ }
+
+ qapi_free_SocketAddress(o->socket);
+
+ o->socket = NULL;
+
+ visit_type_SocketAddress(v, name, &o->socket, errp);
+
+ if (o->socket->type != SOCKET_ADDRESS_TYPE_UNIX) {
+ error_setg(errp, "vfu: Unsupported socket type - %s",
+ SocketAddressType_str(o->socket->type));
+ qapi_free_SocketAddress(o->socket);
+ o->socket = NULL;
+ return;
+ }
+
+ trace_vfu_prop("socket", o->socket->u.q_unix.path);
+
+ vfu_object_init_ctx(o, errp);
+}
+
+static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
+{
+ VfuObject *o = VFU_OBJECT(obj);
+
+ if (o->vfu_ctx) {
+ error_setg(errp, "vfu: Unable to set device property - server busy");
+ return;
+ }
+
+ g_free(o->device);
+
+ o->device = g_strdup(str);
+
+ trace_vfu_prop("device", str);
+
+ vfu_object_init_ctx(o, errp);
+}
+
+static void vfu_object_ctx_run(void *opaque)
+{
+ VfuObject *o = opaque;
+ const char *vfu_id;
+ char *vfu_path, *pci_dev_path;
+ int ret = -1;
+
+ while (ret != 0) {
+ ret = vfu_run_ctx(o->vfu_ctx);
+ if (ret < 0) {
+ if (errno == EINTR) {
+ continue;
+ } else if (errno == ENOTCONN) {
+ vfu_id = object_get_canonical_path_component(OBJECT(o));
+ vfu_path = object_get_canonical_path(OBJECT(o));
+ g_assert(o->pci_dev);
+ pci_dev_path = object_get_canonical_path(OBJECT(o->pci_dev));
+ /* o->device is a required property and is non-NULL here */
+ g_assert(o->device);
+ qapi_event_send_vfu_client_hangup(vfu_id, vfu_path,
+ o->device, pci_dev_path);
+ qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
+ o->vfu_poll_fd = -1;
+ object_unparent(OBJECT(o));
+ g_free(vfu_path);
+ g_free(pci_dev_path);
+ break;
+ } else {
+ VFU_OBJECT_ERROR(o, "vfu: Failed to run device %s - %s",
+ o->device, strerror(errno));
+ break;
+ }
+ }
+ }
+}
+
+static void vfu_object_attach_ctx(void *opaque)
+{
+ VfuObject *o = opaque;
+ GPollFD pfds[1];
+ int ret;
+
+ qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
+
+ pfds[0].fd = o->vfu_poll_fd;
+ pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
+
+retry_attach:
+ ret = vfu_attach_ctx(o->vfu_ctx);
+ if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
+ /**
+ * vfu_object_attach_ctx can block QEMU's main loop
+ * during attach - the monitor and other IO
+ * could be unresponsive during this time.
+ */
+ (void)qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
+ goto retry_attach;
+ } else if (ret < 0) {
+ VFU_OBJECT_ERROR(o, "vfu: Failed to attach device %s to context - %s",
+ o->device, strerror(errno));
+ return;
+ }
+
+ o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
+ if (o->vfu_poll_fd < 0) {
+ VFU_OBJECT_ERROR(o, "vfu: Failed to get poll fd %s", o->device);
+ return;
+ }
+
+ qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o);
+}
+
+static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
+ size_t count, loff_t offset,
+ const bool is_write)
+{
+ VfuObject *o = vfu_get_private(vfu_ctx);
+ uint32_t pci_access_width = sizeof(uint32_t);
+ size_t bytes = count;
+ uint32_t val = 0;
+ char *ptr = buf;
+ int len;
+
+ /*
+ * Writes to the BAR registers would trigger an update to the
+ * global Memory and IO AddressSpaces. But the remote device
+ * never uses the global AddressSpaces, therefore overlapping
+ * memory regions are not a problem
+ */
+ while (bytes > 0) {
+ len = (bytes > pci_access_width) ? pci_access_width : bytes;
+ if (is_write) {
+ memcpy(&val, ptr, len);
+ pci_host_config_write_common(o->pci_dev, offset,
+ pci_config_size(o->pci_dev),
+ val, len);
+ trace_vfu_cfg_write(offset, val);
+ } else {
+ val = pci_host_config_read_common(o->pci_dev, offset,
+ pci_config_size(o->pci_dev), len);
+ memcpy(ptr, &val, len);
+ trace_vfu_cfg_read(offset, val);
+ }
+ offset += len;
+ ptr += len;
+ bytes -= len;
+ }
+
+ return count;
+}
+
+static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
+{
+ VfuObject *o = vfu_get_private(vfu_ctx);
+ AddressSpace *dma_as = NULL;
+ MemoryRegion *subregion = NULL;
+ g_autofree char *name = NULL;
+ struct iovec *iov = &info->iova;
+
+ if (!info->vaddr) {
+ return;
+ }
+
+ name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
+ (uint64_t)info->vaddr);
+
+ subregion = g_new0(MemoryRegion, 1);
+
+ memory_region_init_ram_ptr(subregion, NULL, name,
+ iov->iov_len, info->vaddr);
+
+ dma_as = pci_device_iommu_address_space(o->pci_dev);
+
+ memory_region_add_subregion(dma_as->root, (hwaddr)iov->iov_base, subregion);
+
+ trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len);
+}
+
+static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
+{
+ VfuObject *o = vfu_get_private(vfu_ctx);
+ AddressSpace *dma_as = NULL;
+ MemoryRegion *mr = NULL;
+ ram_addr_t offset;
+
+ mr = memory_region_from_host(info->vaddr, &offset);
+ if (!mr) {
+ return;
+ }
+
+ dma_as = pci_device_iommu_address_space(o->pci_dev);
+
+ memory_region_del_subregion(dma_as->root, mr);
+
+ object_unparent((OBJECT(mr)));
+
+ trace_vfu_dma_unregister((uint64_t)info->iova.iov_base);
+}
+
+static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset,
+ hwaddr size, const bool is_write)
+{
+ uint8_t *ptr = buf;
+ bool release_lock = false;
+ uint8_t *ram_ptr = NULL;
+ MemTxResult result;
+ int access_size;
+ uint64_t val;
+
+ if (memory_access_is_direct(mr, is_write)) {
+ /**
+ * Some devices expose a PCI expansion ROM, which could be buffer
+ * based as compared to other regions which are primarily based on
+ * MemoryRegionOps. memory_region_find() would already check
+ * for buffer overflow, we don't need to repeat it here.
+ */
+ ram_ptr = memory_region_get_ram_ptr(mr);
+
+ if (is_write) {
+ memcpy((ram_ptr + offset), buf, size);
+ } else {
+ memcpy(buf, (ram_ptr + offset), size);
+ }
+
+ return 0;
+ }
+
+ while (size) {
+ /**
+ * The read/write logic used below is similar to the ones in
+ * flatview_read/write_continue()
+ */
+ release_lock = prepare_mmio_access(mr);
+
+ access_size = memory_access_size(mr, size, offset);
+
+ if (is_write) {
+ val = ldn_he_p(ptr, access_size);
+
+ result = memory_region_dispatch_write(mr, offset, val,
+ size_memop(access_size),
+ MEMTXATTRS_UNSPECIFIED);
+ } else {
+ result = memory_region_dispatch_read(mr, offset, &val,
+ size_memop(access_size),
+ MEMTXATTRS_UNSPECIFIED);
+
+ stn_he_p(ptr, access_size, val);
+ }
+
+ if (release_lock) {
+ qemu_mutex_unlock_iothread();
+ release_lock = false;
+ }
+
+ if (result != MEMTX_OK) {
+ return -1;
+ }
+
+ size -= access_size;
+ ptr += access_size;
+ offset += access_size;
+ }
+
+ return 0;
+}
+
+static size_t vfu_object_bar_rw(PCIDevice *pci_dev, int pci_bar,
+ hwaddr bar_offset, char * const buf,
+ hwaddr len, const bool is_write)
+{
+ MemoryRegionSection section = { 0 };
+ uint8_t *ptr = (uint8_t *)buf;
+ MemoryRegion *section_mr = NULL;
+ uint64_t section_size;
+ hwaddr section_offset;
+ hwaddr size = 0;
+
+ while (len) {
+ section = memory_region_find(pci_dev->io_regions[pci_bar].memory,
+ bar_offset, len);
+
+ if (!section.mr) {
+ warn_report("vfu: invalid address 0x%"PRIx64"", bar_offset);
+ return size;
+ }
+
+ section_mr = section.mr;
+ section_offset = section.offset_within_region;
+ section_size = int128_get64(section.size);
+
+ if (is_write && section_mr->readonly) {
+ warn_report("vfu: attempting to write to readonly region in "
+ "bar %d - [0x%"PRIx64" - 0x%"PRIx64"]",
+ pci_bar, bar_offset,
+ (bar_offset + section_size));
+ memory_region_unref(section_mr);
+ return size;
+ }
+
+ if (vfu_object_mr_rw(section_mr, ptr, section_offset,
+ section_size, is_write)) {
+ warn_report("vfu: failed to %s "
+ "[0x%"PRIx64" - 0x%"PRIx64"] in bar %d",
+ is_write ? "write to" : "read from", bar_offset,
+ (bar_offset + section_size), pci_bar);
+ memory_region_unref(section_mr);
+ return size;
+ }
+
+ size += section_size;
+ bar_offset += section_size;
+ ptr += section_size;
+ len -= section_size;
+
+ memory_region_unref(section_mr);
+ }
+
+ return size;
+}
+
+/**
+ * VFU_OBJECT_BAR_HANDLER - macro for defining handlers for PCI BARs.
+ *
+ * To create handler for BAR number 2, VFU_OBJECT_BAR_HANDLER(2) would
+ * define vfu_object_bar2_handler
+ */
+#define VFU_OBJECT_BAR_HANDLER(BAR_NO) \
+ static ssize_t vfu_object_bar##BAR_NO##_handler(vfu_ctx_t *vfu_ctx, \
+ char * const buf, size_t count, \
+ loff_t offset, const bool is_write) \
+ { \
+ VfuObject *o = vfu_get_private(vfu_ctx); \
+ PCIDevice *pci_dev = o->pci_dev; \
+ \
+ return vfu_object_bar_rw(pci_dev, BAR_NO, offset, \
+ buf, count, is_write); \
+ } \
+
+VFU_OBJECT_BAR_HANDLER(0)
+VFU_OBJECT_BAR_HANDLER(1)
+VFU_OBJECT_BAR_HANDLER(2)
+VFU_OBJECT_BAR_HANDLER(3)
+VFU_OBJECT_BAR_HANDLER(4)
+VFU_OBJECT_BAR_HANDLER(5)
+VFU_OBJECT_BAR_HANDLER(6)
+
+static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
+ &vfu_object_bar0_handler,
+ &vfu_object_bar1_handler,
+ &vfu_object_bar2_handler,
+ &vfu_object_bar3_handler,
+ &vfu_object_bar4_handler,
+ &vfu_object_bar5_handler,
+ &vfu_object_bar6_handler,
+};
+
+/**
+ * vfu_object_register_bars - Identify active BAR regions of pdev and setup
+ * callbacks to handle read/write accesses
+ */
+static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
+{
+ int flags = VFU_REGION_FLAG_RW;
+ int i;
+
+ for (i = 0; i < PCI_NUM_REGIONS; i++) {
+ if (!pdev->io_regions[i].size) {
+ continue;
+ }
+
+ if ((i == VFU_PCI_DEV_ROM_REGION_IDX) ||
+ pdev->io_regions[i].memory->readonly) {
+ flags &= ~VFU_REGION_FLAG_WRITE;
+ }
+
+ vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX + i,
+ (size_t)pdev->io_regions[i].size,
+ vfu_object_bar_handlers[i],
+ flags, NULL, 0, -1, 0);
+
+ trace_vfu_bar_register(i, pdev->io_regions[i].addr,
+ pdev->io_regions[i].size);
+ }
+}
+
+static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
+{
+ int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
+ pci_dev->devfn);
+
+ return pci_bdf;
+}
+
+static void vfu_object_set_irq(void *opaque, int pirq, int level)
+{
+ PCIBus *pci_bus = opaque;
+ PCIDevice *pci_dev = NULL;
+ vfu_ctx_t *vfu_ctx = NULL;
+ int pci_bus_num, devfn;
+
+ if (level) {
+ pci_bus_num = PCI_BUS_NUM(pirq);
+ devfn = PCI_BDF_TO_DEVFN(pirq);
+
+ /*
+ * pci_find_device() performs at O(1) if the device is attached
+ * to the root PCI bus. Whereas, if the device is attached to a
+ * secondary PCI bus (such as when a root port is involved),
+ * finding the parent PCI bus could take O(n)
+ */
+ pci_dev = pci_find_device(pci_bus, pci_bus_num, devfn);
+
+ vfu_ctx = pci_dev->irq_opaque;
+
+ g_assert(vfu_ctx);
+
+ vfu_irq_trigger(vfu_ctx, 0);
+ }
+}
+
+static MSIMessage vfu_object_msi_prepare_msg(PCIDevice *pci_dev,
+ unsigned int vector)
+{
+ MSIMessage msg;
+
+ msg.address = 0;
+ msg.data = vector;
+
+ return msg;
+}
+
+static void vfu_object_msi_trigger(PCIDevice *pci_dev, MSIMessage msg)
+{
+ vfu_ctx_t *vfu_ctx = pci_dev->irq_opaque;
+
+ vfu_irq_trigger(vfu_ctx, msg.data);
+}
+
+static void vfu_object_setup_msi_cbs(VfuObject *o)
+{
+ o->default_msi_trigger = o->pci_dev->msi_trigger;
+ o->default_msi_prepare_message = o->pci_dev->msi_prepare_message;
+ o->default_msix_prepare_message = o->pci_dev->msix_prepare_message;
+
+ o->pci_dev->msi_trigger = vfu_object_msi_trigger;
+ o->pci_dev->msi_prepare_message = vfu_object_msi_prepare_msg;
+ o->pci_dev->msix_prepare_message = vfu_object_msi_prepare_msg;
+}
+
+static void vfu_object_restore_msi_cbs(VfuObject *o)
+{
+ o->pci_dev->msi_trigger = o->default_msi_trigger;
+ o->pci_dev->msi_prepare_message = o->default_msi_prepare_message;
+ o->pci_dev->msix_prepare_message = o->default_msix_prepare_message;
+}
+
+static void vfu_msix_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
+ uint32_t count, bool mask)
+{
+ VfuObject *o = vfu_get_private(vfu_ctx);
+ Error *err = NULL;
+ uint32_t vector;
+
+ for (vector = start; vector < count; vector++) {
+ msix_set_mask(o->pci_dev, vector, mask, &err);
+ if (err) {
+ VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
+ error_get_pretty(err));
+ error_free(err);
+ err = NULL;
+ }
+ }
+}
+
+static void vfu_msi_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
+ uint32_t count, bool mask)
+{
+ VfuObject *o = vfu_get_private(vfu_ctx);
+ Error *err = NULL;
+ uint32_t vector;
+
+ for (vector = start; vector < count; vector++) {
+ msi_set_mask(o->pci_dev, vector, mask, &err);
+ if (err) {
+ VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
+ error_get_pretty(err));
+ error_free(err);
+ err = NULL;
+ }
+ }
+}
+
+static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
+{
+ vfu_ctx_t *vfu_ctx = o->vfu_ctx;
+ int ret;
+
+ ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (msix_nr_vectors_allocated(pci_dev)) {
+ ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
+ msix_nr_vectors_allocated(pci_dev));
+ vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSIX_IRQ,
+ &vfu_msix_irq_state);
+ } else if (msi_nr_vectors_allocated(pci_dev)) {
+ ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
+ msi_nr_vectors_allocated(pci_dev));
+ vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSI_IRQ,
+ &vfu_msi_irq_state);
+ }
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ vfu_object_setup_msi_cbs(o);
+
+ pci_dev->irq_opaque = vfu_ctx;
+
+ return 0;
+}
+
+void vfu_object_set_bus_irq(PCIBus *pci_bus)
+{
+ int bus_num = pci_bus_num(pci_bus);
+ int max_bdf = PCI_BUILD_BDF(bus_num, PCI_DEVFN_MAX - 1);
+
+ pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus,
+ max_bdf);
+}
+
+static int vfu_object_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type)
+{
+ VfuObject *o = vfu_get_private(vfu_ctx);
+
+ /* vfu_object_ctx_run() handles lost connection */
+ if (type == VFU_RESET_LOST_CONN) {
+ return 0;
+ }
+
+ qdev_reset_all(DEVICE(o->pci_dev));
+
+ return 0;
+}
+
+/*
+ * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
+ * properties. It also depends on devices instantiated in QEMU. These
+ * dependencies are not available during the instance_init phase of this
+ * object's life-cycle. As such, the server is initialized after the
+ * machine is setup. machine_init_done_notifier notifies TYPE_VFU_OBJECT
+ * when the machine is setup, and the dependencies are available.
+ */
+static void vfu_object_machine_done(Notifier *notifier, void *data)
+{
+ VfuObject *o = container_of(notifier, VfuObject, machine_done);
+ Error *err = NULL;
+
+ vfu_object_init_ctx(o, &err);
+
+ if (err) {
+ error_propagate(&error_abort, err);
+ }
+}
+
+/**
+ * vfu_object_init_ctx: Create and initialize libvfio-user context. Add
+ * an unplug blocker for the associated PCI device. Setup a FD handler
+ * to process incoming messages in the context's socket.
+ *
+ * The socket and device properties are mandatory, and this function
+ * will not create the context without them - the setters for these
+ * properties should call this function when the property is set. The
+ * machine should also be ready when this function is invoked - it is
+ * because QEMU objects are initialized before devices, and the
+ * associated PCI device wouldn't be available at the object
+ * initialization time. Until these conditions are satisfied, this
+ * function would return early without performing any task.
+ */
+static void vfu_object_init_ctx(VfuObject *o, Error **errp)
+{
+ ERRP_GUARD();
+ DeviceState *dev = NULL;
+ vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
+ int ret;
+
+ if (o->vfu_ctx || !o->socket || !o->device ||
+ !phase_check(PHASE_MACHINE_READY)) {
+ return;
+ }
+
+ if (o->err) {
+ error_propagate(errp, o->err);
+ o->err = NULL;
+ return;
+ }
+
+ o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path,
+ LIBVFIO_USER_FLAG_ATTACH_NB,
+ o, VFU_DEV_TYPE_PCI);
+ if (o->vfu_ctx == NULL) {
+ error_setg(errp, "vfu: Failed to create context - %s", strerror(errno));
+ return;
+ }
+
+ dev = qdev_find_recursive(sysbus_get_default(), o->device);
+ if (dev == NULL) {
+ error_setg(errp, "vfu: Device %s not found", o->device);
+ goto fail;
+ }
+
+ if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+ error_setg(errp, "vfu: %s not a PCI device", o->device);
+ goto fail;
+ }
+
+ o->pci_dev = PCI_DEVICE(dev);
+
+ object_ref(OBJECT(o->pci_dev));
+
+ if (pci_is_express(o->pci_dev)) {
+ pci_type = VFU_PCI_TYPE_EXPRESS;
+ }
+
+ ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0);
+ if (ret < 0) {
+ error_setg(errp,
+ "vfu: Failed to attach PCI device %s to context - %s",
+ o->device, strerror(errno));
+ goto fail;
+ }
+
+ error_setg(&o->unplug_blocker,
+ "vfu: %s for %s must be deleted before unplugging",
+ TYPE_VFU_OBJECT, o->device);
+ qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
+
+ ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX,
+ pci_config_size(o->pci_dev), &vfu_object_cfg_access,
+ VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB,
+ NULL, 0, -1, 0);
+ if (ret < 0) {
+ error_setg(errp,
+ "vfu: Failed to setup config space handlers for %s- %s",
+ o->device, strerror(errno));
+ goto fail;
+ }
+
+ ret = vfu_setup_device_dma(o->vfu_ctx, &dma_register, &dma_unregister);
+ if (ret < 0) {
+ error_setg(errp, "vfu: Failed to setup DMA handlers for %s",
+ o->device);
+ goto fail;
+ }
+
+ vfu_object_register_bars(o->vfu_ctx, o->pci_dev);
+
+ ret = vfu_object_setup_irqs(o, o->pci_dev);
+ if (ret < 0) {
+ error_setg(errp, "vfu: Failed to setup interrupts for %s",
+ o->device);
+ goto fail;
+ }
+
+ ret = vfu_setup_device_reset_cb(o->vfu_ctx, &vfu_object_device_reset);
+ if (ret < 0) {
+ error_setg(errp, "vfu: Failed to setup reset callback");
+ goto fail;
+ }
+
+ ret = vfu_realize_ctx(o->vfu_ctx);
+ if (ret < 0) {
+ error_setg(errp, "vfu: Failed to realize device %s- %s",
+ o->device, strerror(errno));
+ goto fail;
+ }
+
+ o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
+ if (o->vfu_poll_fd < 0) {
+ error_setg(errp, "vfu: Failed to get poll fd %s", o->device);
+ goto fail;
+ }
+
+ qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o);
+
+ return;
+
+fail:
+ vfu_destroy_ctx(o->vfu_ctx);
+ if (o->unplug_blocker && o->pci_dev) {
+ qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
+ error_free(o->unplug_blocker);
+ o->unplug_blocker = NULL;
+ }
+ if (o->pci_dev) {
+ vfu_object_restore_msi_cbs(o);
+ o->pci_dev->irq_opaque = NULL;
+ object_unref(OBJECT(o->pci_dev));
+ o->pci_dev = NULL;
+ }
+ o->vfu_ctx = NULL;
+}
+
+static void vfu_object_init(Object *obj)
+{
+ VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
+ VfuObject *o = VFU_OBJECT(obj);
+
+ k->nr_devs++;
+
+ if (!object_dynamic_cast(OBJECT(current_machine), TYPE_REMOTE_MACHINE)) {
+ error_setg(&o->err, "vfu: %s only compatible with %s machine",
+ TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
+ return;
+ }
+
+ if (!phase_check(PHASE_MACHINE_READY)) {
+ o->machine_done.notify = vfu_object_machine_done;
+ qemu_add_machine_init_done_notifier(&o->machine_done);
+ }
+
+ o->vfu_poll_fd = -1;
+}
+
+static void vfu_object_finalize(Object *obj)
+{
+ VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
+ VfuObject *o = VFU_OBJECT(obj);
+
+ k->nr_devs--;
+
+ qapi_free_SocketAddress(o->socket);
+
+ o->socket = NULL;
+
+ if (o->vfu_poll_fd != -1) {
+ qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
+ o->vfu_poll_fd = -1;
+ }
+
+ if (o->vfu_ctx) {
+ vfu_destroy_ctx(o->vfu_ctx);
+ o->vfu_ctx = NULL;
+ }
+
+ g_free(o->device);
+
+ o->device = NULL;
+
+ if (o->unplug_blocker && o->pci_dev) {
+ qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
+ error_free(o->unplug_blocker);
+ o->unplug_blocker = NULL;
+ }
+
+ if (o->pci_dev) {
+ vfu_object_restore_msi_cbs(o);
+ o->pci_dev->irq_opaque = NULL;
+ object_unref(OBJECT(o->pci_dev));
+ o->pci_dev = NULL;
+ }
+
+ if (!k->nr_devs && vfu_object_auto_shutdown()) {
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+ }
+
+ if (o->machine_done.notify) {
+ qemu_remove_machine_init_done_notifier(&o->machine_done);
+ o->machine_done.notify = NULL;
+ }
+}
+
+static void vfu_object_class_init(ObjectClass *klass, void *data)
+{
+ VfuObjectClass *k = VFU_OBJECT_CLASS(klass);
+
+ k->nr_devs = 0;
+
+ object_class_property_add(klass, "socket", "SocketAddress", NULL,
+ vfu_object_set_socket, NULL, NULL);
+ object_class_property_set_description(klass, "socket",
+ "SocketAddress "
+ "(ex: type=unix,path=/tmp/sock). "
+ "Only UNIX is presently supported");
+ object_class_property_add_str(klass, "device", NULL,
+ vfu_object_set_device);
+ object_class_property_set_description(klass, "device",
+ "device ID - only PCI devices "
+ "are presently supported");
+}
+
+static const TypeInfo vfu_object_info = {
+ .name = TYPE_VFU_OBJECT,
+ .parent = TYPE_OBJECT,
+ .instance_size = sizeof(VfuObject),
+ .instance_init = vfu_object_init,
+ .instance_finalize = vfu_object_finalize,
+ .class_size = sizeof(VfuObjectClass),
+ .class_init = vfu_object_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+static void vfu_register_types(void)
+{
+ type_register_static(&vfu_object_info);
+}
+
+type_init(vfu_register_types);
diff --git a/include/exec/memory.h b/include/exec/memory.h
index f1c1945..a6a0f4d 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -2810,6 +2810,9 @@ MemTxResult address_space_write_cached_slow(MemoryRegionCache *cache,
hwaddr addr, const void *buf,
hwaddr len);
+int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr);
+bool prepare_mmio_access(MemoryRegion *mr);
+
static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
{
if (is_write) {
diff --git a/include/hw/pci/msi.h b/include/hw/pci/msi.h
index 40876884..58aa576 100644
--- a/include/hw/pci/msi.h
+++ b/include/hw/pci/msi.h
@@ -43,6 +43,7 @@ void msi_notify(PCIDevice *dev, unsigned int vector);
void msi_send_message(PCIDevice *dev, MSIMessage msg);
void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len);
unsigned int msi_nr_vectors_allocated(const PCIDevice *dev);
+void msi_set_mask(PCIDevice *dev, int vector, bool mask, Error **errp);
static inline bool msi_present(const PCIDevice *dev)
{
diff --git a/include/hw/pci/msix.h b/include/hw/pci/msix.h
index 4c4a60c..4f1cda0 100644
--- a/include/hw/pci/msix.h
+++ b/include/hw/pci/msix.h
@@ -36,6 +36,7 @@ void msix_clr_pending(PCIDevice *dev, int vector);
int msix_vector_use(PCIDevice *dev, unsigned vector);
void msix_vector_unuse(PCIDevice *dev, unsigned vector);
void msix_unuse_all_vectors(PCIDevice *dev);
+void msix_set_mask(PCIDevice *dev, int vector, bool mask, Error **errp);
void msix_notify(PCIDevice *dev, unsigned vector);
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 44dacfa..b54b6ef 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -16,6 +16,7 @@ extern bool pci_available;
#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn) ((devfn) & 0x07)
#define PCI_BUILD_BDF(bus, devfn) ((bus << 8) | (devfn))
+#define PCI_BDF_TO_DEVFN(x) ((x) & 0xff)
#define PCI_BUS_MAX 256
#define PCI_DEVFN_MAX 256
#define PCI_SLOT_MAX 32
@@ -127,6 +128,10 @@ typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
pcibus_t addr, pcibus_t size, int type);
typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
+typedef void MSITriggerFunc(PCIDevice *dev, MSIMessage msg);
+typedef MSIMessage MSIPrepareMessageFunc(PCIDevice *dev, unsigned vector);
+typedef MSIMessage MSIxPrepareMessageFunc(PCIDevice *dev, unsigned vector);
+
typedef struct PCIIORegion {
pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
#define PCI_BAR_UNMAPPED (~(pcibus_t)0)
@@ -329,6 +334,14 @@ struct PCIDevice {
/* Space to store MSIX table & pending bit array */
uint8_t *msix_table;
uint8_t *msix_pba;
+
+ /* May be used by INTx or MSI during interrupt notification */
+ void *irq_opaque;
+
+ MSITriggerFunc *msi_trigger;
+ MSIPrepareMessageFunc *msi_prepare_message;
+ MSIxPrepareMessageFunc *msix_prepare_message;
+
/* MemoryRegion container for msix exclusive BAR setup */
MemoryRegion msix_exclusive_bar;
/* Memory Regions for MSIX table and pending bit entries. */
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index 92c3d65..98774e2 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -193,6 +193,7 @@ struct DeviceState {
int instance_id_alias;
int alias_required_for_version;
ResettableState reset;
+ GSList *unplug_blockers;
};
struct DeviceListener {
@@ -420,6 +421,34 @@ void qdev_machine_creation_done(void);
bool qdev_machine_modified(void);
/**
+ * qdev_add_unplug_blocker: Add an unplug blocker to a device
+ *
+ * @dev: Device to be blocked from unplug
+ * @reason: Reason for blocking
+ */
+void qdev_add_unplug_blocker(DeviceState *dev, Error *reason);
+
+/**
+ * qdev_del_unplug_blocker: Remove an unplug blocker from a device
+ *
+ * @dev: Device to be unblocked
+ * @reason: Pointer to the Error used with qdev_add_unplug_blocker.
+ * Used as a handle to lookup the blocker for deletion.
+ */
+void qdev_del_unplug_blocker(DeviceState *dev, Error *reason);
+
+/**
+ * qdev_unplug_blocked: Confirm if a device is blocked from unplug
+ *
+ * @dev: Device to be tested
+ * @reason: Returns one of the reasons why the device is blocked,
+ * if any
+ *
+ * Returns: true if device is blocked from unplug, false otherwise
+ */
+bool qdev_unplug_blocked(DeviceState *dev, Error **errp);
+
+/**
* GpioPolarity: Polarity of a GPIO line
*
* GPIO lines use either positive (active-high) logic,
diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
new file mode 100644
index 0000000..33b68a8
--- /dev/null
+++ b/include/hw/remote/iommu.h
@@ -0,0 +1,40 @@
+/**
+ * Copyright © 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_IOMMU_H
+#define REMOTE_IOMMU_H
+
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci.h"
+
+#ifndef INT2VOIDP
+#define INT2VOIDP(i) (void *)(uintptr_t)(i)
+#endif
+
+typedef struct RemoteIommuElem {
+ MemoryRegion *mr;
+
+ AddressSpace as;
+} RemoteIommuElem;
+
+#define TYPE_REMOTE_IOMMU "x-remote-iommu"
+OBJECT_DECLARE_SIMPLE_TYPE(RemoteIommu, REMOTE_IOMMU)
+
+struct RemoteIommu {
+ Object parent;
+
+ GHashTable *elem_by_devfn;
+
+ QemuMutex lock;
+};
+
+void remote_iommu_setup(PCIBus *pci_bus);
+
+void remote_iommu_unplug_dev(PCIDevice *pci_dev);
+
+#endif
diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h
index 2a2a33c..ac32fda 100644
--- a/include/hw/remote/machine.h
+++ b/include/hw/remote/machine.h
@@ -22,6 +22,10 @@ struct RemoteMachineState {
RemotePCIHost *host;
RemoteIOHubState iohub;
+
+ bool vfio_user;
+
+ bool auto_shutdown;
};
/* Used to pass to co-routine device and ioc. */
diff --git a/include/hw/remote/vfio-user-obj.h b/include/hw/remote/vfio-user-obj.h
new file mode 100644
index 0000000..87ab78b
--- /dev/null
+++ b/include/hw/remote/vfio-user-obj.h
@@ -0,0 +1,6 @@
+#ifndef VFIO_USER_OBJ_H
+#define VFIO_USER_OBJ_H
+
+void vfu_object_set_bus_irq(PCIBus *pci_bus);
+
+#endif
diff --git a/meson.build b/meson.build
index 0c2e11f..ca19ddc 100644
--- a/meson.build
+++ b/meson.build
@@ -308,6 +308,10 @@ multiprocess_allowed = get_option('multiprocess') \
.require(targetos == 'linux', error_message: 'Multiprocess QEMU is supported only on Linux') \
.allowed()
+vfio_user_server_allowed = get_option('vfio_user_server') \
+ .require(targetos == 'linux', error_message: 'vfio-user server is supported only on Linux') \
+ .allowed()
+
have_tpm = get_option('tpm') \
.require(targetos != 'windows', error_message: 'TPM emulation only available on POSIX systems') \
.allowed()
@@ -1752,6 +1756,7 @@ config_host_data.set('CONFIG_LIBNFS', libnfs.found())
config_host_data.set('CONFIG_LIBSSH', libssh.found())
config_host_data.set('CONFIG_LINUX_AIO', libaio.found())
config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
+config_host_data.set('CONFIG_LIBURING_REGISTER_RING_FD', cc.has_function('io_uring_register_ring_fd', prefix: '#include <liburing.h>', dependencies:linux_io_uring))
config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
config_host_data.set('CONFIG_NUMA', numa.found())
config_host_data.set('CONFIG_OPENGL', opengl.found())
@@ -2379,7 +2384,8 @@ host_kconfig = \
(have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \
('CONFIG_LINUX' in config_host ? ['CONFIG_LINUX=y'] : []) + \
(have_pvrdma ? ['CONFIG_PVRDMA=y'] : []) + \
- (multiprocess_allowed ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : [])
+ (multiprocess_allowed ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : []) + \
+ (vfio_user_server_allowed ? ['CONFIG_VFIO_USER_SERVER_ALLOWED=y'] : [])
ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ]
@@ -2671,6 +2677,21 @@ if have_system
endif
endif
+libvfio_user_dep = not_found
+if have_system and vfio_user_server_allowed
+ have_internal = fs.exists(meson.current_source_dir() / 'subprojects/libvfio-user/meson.build')
+
+ if not have_internal
+ error('libvfio-user source not found - please pull git submodule')
+ endif
+
+ libvfio_user_proj = subproject('libvfio-user')
+
+ libvfio_user_lib = libvfio_user_proj.get_variable('libvfio_user_dep')
+
+ libvfio_user_dep = declare_dependency(dependencies: [libvfio_user_lib])
+endif
+
fdt = not_found
if have_system
fdt_opt = get_option('fdt')
@@ -3789,6 +3810,7 @@ summary_info += {'target list': ' '.join(target_dirs)}
if have_system
summary_info += {'default devices': get_option('default_devices')}
summary_info += {'out of process emulation': multiprocess_allowed}
+ summary_info += {'vfio-user server': vfio_user_server_allowed}
endif
summary(summary_info, bool_yn: true, section: 'Targets and accelerators')
diff --git a/meson_options.txt b/meson_options.txt
index 0e81973..f3e2f22 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -88,6 +88,8 @@ option('cfi_debug', type: 'boolean', value: 'false',
description: 'Verbose errors in case of CFI violation')
option('multiprocess', type: 'feature', value: 'auto',
description: 'Out of process device emulation support')
+option('vfio_user_server', type: 'feature', value: 'disabled',
+ description: 'vfio-user server support')
option('dbus_display', type: 'feature', value: 'auto',
description: '-display dbus support')
option('tpm', type : 'feature', value : 'auto',
diff --git a/qapi/misc.json b/qapi/misc.json
index 4534448..27ef5a2 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -553,3 +553,34 @@
##
{ 'event': 'RTC_CHANGE',
'data': { 'offset': 'int', 'qom-path': 'str' } }
+
+##
+# @VFU_CLIENT_HANGUP:
+#
+# Emitted when the client of a TYPE_VFIO_USER_SERVER closes the
+# communication channel
+#
+# @vfu-id: ID of the TYPE_VFIO_USER_SERVER object. It is the last component
+# of @vfu-qom-path referenced below
+#
+# @vfu-qom-path: path to the TYPE_VFIO_USER_SERVER object in the QOM tree
+#
+# @dev-id: ID of attached PCI device
+#
+# @dev-qom-path: path to attached PCI device in the QOM tree
+#
+# Since: 7.1
+#
+# Example:
+#
+# <- { "event": "VFU_CLIENT_HANGUP",
+# "data": { "vfu-id": "vfu1",
+# "vfu-qom-path": "/objects/vfu1",
+# "dev-id": "sas1",
+# "dev-qom-path": "/machine/peripheral/sas1" },
+# "timestamp": { "seconds": 1265044230, "microseconds": 450486 } }
+#
+##
+{ 'event': 'VFU_CLIENT_HANGUP',
+ 'data': { 'vfu-id': 'str', 'vfu-qom-path': 'str',
+ 'dev-id': 'str', 'dev-qom-path': 'str' } }
diff --git a/qapi/qom.json b/qapi/qom.json
index 6a653c6..80dd419 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -735,6 +735,20 @@
'data': { 'fd': 'str', 'devid': 'str' } }
##
+# @VfioUserServerProperties:
+#
+# Properties for x-vfio-user-server objects.
+#
+# @socket: socket to be used by the libvfio-user library
+#
+# @device: the ID of the device to be emulated at the server
+#
+# Since: 7.1
+##
+{ 'struct': 'VfioUserServerProperties',
+ 'data': { 'socket': 'SocketAddress', 'device': 'str' } }
+
+##
# @RngProperties:
#
# Properties for objects of classes derived from rng.
@@ -874,7 +888,8 @@
'tls-creds-psk',
'tls-creds-x509',
'tls-cipher-suites',
- { 'name': 'x-remote-object', 'features': [ 'unstable' ] }
+ { 'name': 'x-remote-object', 'features': [ 'unstable' ] },
+ { 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] }
] }
##
@@ -938,7 +953,8 @@
'tls-creds-psk': 'TlsCredsPskProperties',
'tls-creds-x509': 'TlsCredsX509Properties',
'tls-cipher-suites': 'TlsCredsProperties',
- 'x-remote-object': 'RemoteObjectProperties'
+ 'x-remote-object': 'RemoteObjectProperties',
+ 'x-vfio-user-server': 'VfioUserServerProperties'
} }
##
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 1fc1d2e..24eb5f3 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -153,6 +153,8 @@ meson_options_help() {
printf "%s\n" ' usb-redir libusbredir support'
printf "%s\n" ' vde vde network backend support'
printf "%s\n" ' vdi vdi image format support'
+ printf "%s\n" ' vfio-user-server'
+ printf "%s\n" ' vfio-user server support'
printf "%s\n" ' vhost-crypto vhost-user crypto backend support'
printf "%s\n" ' vhost-kernel vhost kernel backend support'
printf "%s\n" ' vhost-net vhost-net kernel acceleration support'
@@ -415,6 +417,8 @@ _meson_option_parse() {
--disable-vde) printf "%s" -Dvde=disabled ;;
--enable-vdi) printf "%s" -Dvdi=enabled ;;
--disable-vdi) printf "%s" -Dvdi=disabled ;;
+ --enable-vfio-user-server) printf "%s" -Dvfio_user_server=enabled ;;
+ --disable-vfio-user-server) printf "%s" -Dvfio_user_server=disabled ;;
--enable-vhost-crypto) printf "%s" -Dvhost_crypto=enabled ;;
--disable-vhost-crypto) printf "%s" -Dvhost_crypto=disabled ;;
--enable-vhost-kernel) printf "%s" -Dvhost_kernel=enabled ;;
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 657841e..fb16be5 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -2719,7 +2719,7 @@ void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
invalidate_and_set_dirty(mr, addr, size);
}
-static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
+int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
{
unsigned access_size_max = mr->ops->valid.max_access_size;
@@ -2746,7 +2746,7 @@ static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
return l;
}
-static bool prepare_mmio_access(MemoryRegion *mr)
+bool prepare_mmio_access(MemoryRegion *mr)
{
bool release_lock = false;
diff --git a/softmmu/qdev-monitor.c b/softmmu/qdev-monitor.c
index bb5897f..4b0ef65 100644
--- a/softmmu/qdev-monitor.c
+++ b/softmmu/qdev-monitor.c
@@ -899,6 +899,10 @@ void qdev_unplug(DeviceState *dev, Error **errp)
HotplugHandlerClass *hdc;
Error *local_err = NULL;
+ if (qdev_unplug_blocked(dev, errp)) {
+ return;
+ }
+
if (dev->parent_bus && !qbus_is_hotpluggable(dev->parent_bus)) {
error_setg(errp, QERR_BUS_NO_HOTPLUG, dev->parent_bus->name);
return;
diff --git a/stubs/meson.build b/stubs/meson.build
index 6f80fec..d8f3fd5 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -60,3 +60,4 @@ if have_system
else
stub_ss.add(files('qdev.c'))
endif
+stub_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_false: files('vfio-user-obj.c'))
diff --git a/stubs/vfio-user-obj.c b/stubs/vfio-user-obj.c
new file mode 100644
index 0000000..79100d7
--- /dev/null
+++ b/stubs/vfio-user-obj.c
@@ -0,0 +1,6 @@
+#include "qemu/osdep.h"
+#include "hw/remote/vfio-user-obj.h"
+
+void vfu_object_set_bus_irq(PCIBus *pci_bus)
+{
+}
diff --git a/subprojects/libvfio-user b/subprojects/libvfio-user
new file mode 160000
+Subproject 0b28d205572c80b568a1003db2c8f37ca333e4d
diff --git a/tests/docker/dockerfiles/centos8.docker b/tests/docker/dockerfiles/centos8.docker
index 4b20925..10618bf 100644
--- a/tests/docker/dockerfiles/centos8.docker
+++ b/tests/docker/dockerfiles/centos8.docker
@@ -51,6 +51,7 @@ RUN dnf update -y && \
libbpf-devel \
libcacard-devel \
libcap-ng-devel \
+ libcmocka-devel \
libcurl-devel \
libdrm-devel \
libepoxy-devel \
@@ -59,6 +60,7 @@ RUN dnf update -y && \
libgcrypt-devel \
libiscsi-devel \
libjpeg-devel \
+ json-c-devel \
libnfs-devel \
libpmem-devel \
libpng-devel \
diff --git a/tests/qtest/fuzz/generic_fuzz.c b/tests/qtest/fuzz/generic_fuzz.c
index 25df19f..447ffe8 100644
--- a/tests/qtest/fuzz/generic_fuzz.c
+++ b/tests/qtest/fuzz/generic_fuzz.c
@@ -144,7 +144,7 @@ static void *pattern_alloc(pattern p, size_t len)
return buf;
}
-static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
+static int fuzz_memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
{
unsigned access_size_max = mr->ops->valid.max_access_size;
@@ -242,11 +242,12 @@ void fuzz_dma_read_cb(size_t addr, size_t len, MemoryRegion *mr)
/*
* If mr1 isn't RAM, address_space_translate doesn't update l. Use
- * memory_access_size to identify the number of bytes that it is safe
- * to write without accidentally writing to another MemoryRegion.
+ * fuzz_memory_access_size to identify the number of bytes that it
+ * is safe to write without accidentally writing to another
+ * MemoryRegion.
*/
if (!memory_region_is_ram(mr1)) {
- l = memory_access_size(mr1, l, addr1);
+ l = fuzz_memory_access_size(mr1, l, addr1);
}
if (memory_region_is_ram(mr1) ||
memory_region_is_romd(mr1) ||