aboutsummaryrefslogtreecommitdiff
path: root/hw
diff options
context:
space:
mode:
Diffstat (limited to 'hw')
-rw-r--r--hw/Kconfig1
-rw-r--r--hw/meson.build1
-rw-r--r--hw/misc/ivshmem.c3
-rw-r--r--hw/pci-host/Kconfig3
-rw-r--r--hw/pci-host/meson.build1
-rw-r--r--hw/pci-host/remote.c75
-rw-r--r--hw/remote/Kconfig4
-rw-r--r--hw/remote/iohub.c119
-rw-r--r--hw/remote/machine.c80
-rw-r--r--hw/remote/memory.c65
-rw-r--r--hw/remote/meson.build13
-rw-r--r--hw/remote/message.c230
-rw-r--r--hw/remote/mpqemu-link.c267
-rw-r--r--hw/remote/proxy-memory-listener.c227
-rw-r--r--hw/remote/proxy.c379
-rw-r--r--hw/remote/remote-obj.c203
-rw-r--r--hw/remote/trace-events4
-rw-r--r--hw/remote/trace.h1
18 files changed, 1675 insertions, 1 deletions
diff --git a/hw/Kconfig b/hw/Kconfig
index d4cec9e..8ea2647 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -27,6 +27,7 @@ source pci-host/Kconfig
source pcmcia/Kconfig
source pci/Kconfig
source rdma/Kconfig
+source remote/Kconfig
source rtc/Kconfig
source scsi/Kconfig
source sd/Kconfig
diff --git a/hw/meson.build b/hw/meson.build
index 010de72..e615d72 100644
--- a/hw/meson.build
+++ b/hw/meson.build
@@ -56,6 +56,7 @@ subdir('moxie')
subdir('nios2')
subdir('openrisc')
subdir('ppc')
+subdir('remote')
subdir('riscv')
subdir('rx')
subdir('s390x')
diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
index 0505b52..603e992 100644
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
@@ -495,7 +495,8 @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
/* mmap the region and map into the BAR2 */
memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s),
- "ivshmem.bar2", size, true, fd, &local_err);
+ "ivshmem.bar2", size, true, fd, 0,
+ &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
diff --git a/hw/pci-host/Kconfig b/hw/pci-host/Kconfig
index eb03f04..8b8c763 100644
--- a/hw/pci-host/Kconfig
+++ b/hw/pci-host/Kconfig
@@ -65,3 +65,6 @@ config PCI_POWERNV
select PCI_EXPRESS
select MSI_NONBROKEN
select PCIE_PORT
+
+config REMOTE_PCIHOST
+ bool
diff --git a/hw/pci-host/meson.build b/hw/pci-host/meson.build
index da9d1a9..1847c69 100644
--- a/hw/pci-host/meson.build
+++ b/hw/pci-host/meson.build
@@ -9,6 +9,7 @@ pci_ss.add(when: 'CONFIG_PCI_EXPRESS_XILINX', if_true: files('xilinx-pcie.c'))
pci_ss.add(when: 'CONFIG_PCI_I440FX', if_true: files('i440fx.c'))
pci_ss.add(when: 'CONFIG_PCI_SABRE', if_true: files('sabre.c'))
pci_ss.add(when: 'CONFIG_XEN_IGD_PASSTHROUGH', if_true: files('xen_igd_pt.c'))
+pci_ss.add(when: 'CONFIG_REMOTE_PCIHOST', if_true: files('remote.c'))
# PPC devices
pci_ss.add(when: 'CONFIG_PREP_PCI', if_true: files('prep.c'))
diff --git a/hw/pci-host/remote.c b/hw/pci-host/remote.c
new file mode 100644
index 0000000..eee4544
--- /dev/null
+++ b/hw/pci-host/remote.c
@@ -0,0 +1,75 @@
+/*
+ * Remote PCI host device
+ *
+ * Unlike PCI host devices that model physical hardware, the purpose
+ * of this PCI host is to host multi-process QEMU devices.
+ *
+ * Multi-process QEMU extends the PCI host of a QEMU machine into a
+ * remote process. Any PCI device attached to the remote process is
+ * visible in the QEMU guest. This allows existing QEMU device models
+ * to be reused in the remote process.
+ *
+ * This PCI host is purely a container for PCI devices. It's fake in the
+ * sense that the guest never sees this PCI host and has no way of
+ * accessing it. Its job is just to provide the environment that QEMU
+ * PCI device models need when running in a remote process.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_host.h"
+#include "hw/pci/pcie_host.h"
+#include "hw/qdev-properties.h"
+#include "hw/pci-host/remote.h"
+#include "exec/memory.h"
+
+/*
+ * root_bus_path callback of the remote PCI host.
+ *
+ * Neither argument is looked at: the path reported is always "0000:00".
+ */
+static const char *remote_pcihost_root_bus_path(PCIHostState *host_bridge,
+                                                PCIBus *rootbus)
+{
+    static const char root_bus_path[] = "0000:00";
+
+    return root_bus_path;
+}
+
+/*
+ * Realize callback: create the "remote-pci" root bus (TYPE_PCIE_BUS) on top
+ * of the memory and I/O regions stored in the RemotePCIHost, and publish it
+ * as the bus of the host bridge. @errp is not used.
+ */
+static void remote_pcihost_realize(DeviceState *dev, Error **errp)
+{
+    PCIHostState *host_bridge = PCI_HOST_BRIDGE(dev);
+    RemotePCIHost *rem_host = REMOTE_PCIHOST(dev);
+    PCIBus *root_bus;
+
+    root_bus = pci_root_bus_new(DEVICE(rem_host), "remote-pci",
+                                rem_host->mr_pci_mem, rem_host->mr_sys_io,
+                                0, TYPE_PCIE_BUS);
+
+    host_bridge->bus = root_bus;
+}
+
+/*
+ * Class initializer of the remote PCI host: installs the realize and
+ * root_bus_path callbacks, makes the device non user-creatable, files it
+ * under the bridge category and sets its fw_name to "pci".
+ */
+static void remote_pcihost_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dev_class = DEVICE_CLASS(klass);
+    PCIHostBridgeClass *host_class = PCI_HOST_BRIDGE_CLASS(klass);
+
+    /* Host bridge hook. */
+    host_class->root_bus_path = remote_pcihost_root_bus_path;
+
+    /* Generic device hooks and attributes. */
+    dev_class->realize = remote_pcihost_realize;
+    dev_class->fw_name = "pci";
+    dev_class->user_creatable = false;
+    set_bit(DEVICE_CATEGORY_BRIDGE, dev_class->categories);
+}
+
+/* QOM description of the remote PCI host, derived from the PCIe host bridge. */
+static const TypeInfo remote_pcihost_info = {
+    .parent        = TYPE_PCIE_HOST_BRIDGE,
+    .name          = TYPE_REMOTE_PCIHOST,
+    .class_init    = remote_pcihost_class_init,
+    .instance_size = sizeof(RemotePCIHost),
+};
+
+/* Registers the type above; invoked through type_init(). */
+static void remote_pcihost_register(void)
+{
+    type_register_static(&remote_pcihost_info);
+}
+
+type_init(remote_pcihost_register)
diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig
new file mode 100644
index 0000000..08c16e2
--- /dev/null
+++ b/hw/remote/Kconfig
@@ -0,0 +1,4 @@
+config MULTIPROCESS
+ bool
+ depends on PCI && PCI_EXPRESS && KVM
+ select REMOTE_PCIHOST
diff --git a/hw/remote/iohub.c b/hw/remote/iohub.c
new file mode 100644
index 0000000..e4ff131
--- /dev/null
+++ b/hw/remote/iohub.c
@@ -0,0 +1,119 @@
+/*
+ * Remote IO Hub
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_ids.h"
+#include "hw/pci/pci_bus.h"
+#include "qemu/thread.h"
+#include "hw/boards.h"
+#include "hw/remote/machine.h"
+#include "hw/remote/iohub.h"
+#include "qemu/main-loop.h"
+
+/*
+ * Put @iohub into its initial state.
+ *
+ * Both notifier arrays are zeroed; then, for each of the
+ * REMOTE_IOHUB_NB_PIRQS lines, the two notifiers are set up with fd -1,
+ * the level counter is cleared and the per-line lock is created.
+ */
+void remote_iohub_init(RemoteIOHubState *iohub)
+{
+    memset(&iohub->irqfds, 0, sizeof(iohub->irqfds));
+    memset(&iohub->resamplefds, 0, sizeof(iohub->resamplefds));
+
+    for (int line = 0; line < REMOTE_IOHUB_NB_PIRQS; line++) {
+        event_notifier_init_fd(&iohub->irqfds[line], -1);
+        event_notifier_init_fd(&iohub->resamplefds[line], -1);
+
+        iohub->irq_level[line] = 0;
+        qemu_mutex_init(&iohub->irq_level_lock[line]);
+    }
+}
+
+/*
+ * Tear down what remote_iohub_init() and process_set_irqfd_msg() set up:
+ * for every line, remove the fd handler of the resample notifier, clean up
+ * both notifiers and destroy the per-line lock.
+ */
+void remote_iohub_finalize(RemoteIOHubState *iohub)
+{
+    int line = 0;
+
+    while (line < REMOTE_IOHUB_NB_PIRQS) {
+        int resample_fd = event_notifier_get_fd(&iohub->resamplefds[line]);
+
+        /* Unhook the handler before the notifier itself is cleaned up. */
+        qemu_set_fd_handler(resample_fd, NULL, NULL, NULL);
+
+        event_notifier_cleanup(&iohub->irqfds[line]);
+        event_notifier_cleanup(&iohub->resamplefds[line]);
+        qemu_mutex_destroy(&iohub->irq_level_lock[line]);
+
+        line++;
+    }
+}
+
+/*
+ * Map an INTx pin of @pci_dev to an IO hub line. @intx is ignored: the line
+ * number is simply the device's devfn.
+ */
+int remote_iohub_map_irq(PCIDevice *pci_dev, int intx)
+{
+    int pirq = pci_dev->devfn;
+
+    return pirq;
+}
+
+/*
+ * IRQ callback registered with pci_bus_irqs().
+ *
+ * @opaque is the RemoteIOHubState, @pirq the line number and @level the new
+ * level. Under the per-line lock, a non-zero @level increments the line's
+ * counter and signals the irqfd when the counter goes from 0 to 1; a zero
+ * @level decrements the counter, never below 0.
+ */
+void remote_iohub_set_irq(void *opaque, int pirq, int level)
+{
+    RemoteIOHubState *iohub = opaque;
+
+    /*
+     * Bound the index with the same constant that sizes the loops over the
+     * per-line arrays and the number of IRQs handed to pci_bus_irqs().
+     */
+    assert(pirq >= 0);
+    assert(pirq < REMOTE_IOHUB_NB_PIRQS);
+
+    QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
+
+    if (level) {
+        if (++iohub->irq_level[pirq] == 1) {
+            event_notifier_set(&iohub->irqfds[pirq]);
+        }
+    } else if (iohub->irq_level[pirq] > 0) {
+        iohub->irq_level[pirq]--;
+    }
+}
+
+/*
+ * fd handler installed on a line's resample notifier by
+ * process_set_irqfd_msg(); @opaque is that line's ResampleToken.
+ *
+ * Clears the resample notifier and, if the line's level counter is still
+ * non-zero, signals the irqfd again.
+ */
+static void intr_resample_handler(void *opaque)
+{
+    ResampleToken *tok = opaque;
+    RemoteIOHubState *hub = tok->iohub;
+    int line = tok->pirq;
+    int cleared = event_notifier_test_and_clear(&hub->resamplefds[line]);
+
+    assert(cleared >= 0);
+
+    QEMU_LOCK_GUARD(&hub->irq_level_lock[line]);
+
+    if (hub->irq_level[line] != 0) {
+        event_notifier_set(&hub->irqfds[line]);
+    }
+}
+
+/*
+ * Handle MPQEMU_CMD_SET_IRQFD for @pci_dev.
+ *
+ * msg->fds[0] becomes the irqfd and msg->fds[1] the resample fd of the line
+ * the device maps to. If the line already has an irqfd (fd != -1), the old
+ * handler, notifiers and token are torn down first. Finally
+ * intr_resample_handler() is installed on the resample fd.
+ */
+void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg)
+{
+    RemoteMachineState *machine = REMOTE_MACHINE(current_machine);
+    RemoteIOHubState *hub = &machine->iohub;
+    int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+    int line = remote_iohub_map_irq(pci_dev, pin);
+    bool already_set = event_notifier_get_fd(&hub->irqfds[line]) != -1;
+
+    if (already_set) {
+        int old_resample_fd = event_notifier_get_fd(&hub->resamplefds[line]);
+
+        qemu_set_fd_handler(old_resample_fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&hub->irqfds[line]);
+        event_notifier_cleanup(&hub->resamplefds[line]);
+        memset(&hub->token[line], 0, sizeof(ResampleToken));
+    }
+
+    event_notifier_init_fd(&hub->irqfds[line], msg->fds[0]);
+    event_notifier_init_fd(&hub->resamplefds[line], msg->fds[1]);
+
+    hub->token[line].pirq = line;
+    hub->token[line].iohub = hub;
+
+    qemu_set_fd_handler(msg->fds[1], intr_resample_handler, NULL,
+                        &hub->token[line]);
+}
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
new file mode 100644
index 0000000..c0ab4f5
--- /dev/null
+++ b/hw/remote/machine.c
@@ -0,0 +1,80 @@
+/*
+ * Machine for remote device
+ *
+ * This machine type is used by the remote device process in multi-process
+ * QEMU. QEMU device models depend on parent busses, interrupt controllers,
+ * memory regions, etc. The remote machine type offers this environment so
+ * that QEMU device models can be used as remote devices.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/machine.h"
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+#include "qapi/error.h"
+#include "hw/pci/pci_host.h"
+#include "hw/remote/iohub.h"
+
+/*
+ * Machine init callback of the remote machine.
+ *
+ * Creates a "pci" memory region overlapping system memory at address 0 with
+ * priority -1, creates and realizes the remote PCI host on the default
+ * sysbus as child "remote-pcihost" of the machine, initializes the IO hub
+ * and wires it up as the IRQ handler of the host's PCI bus.
+ */
+static void remote_machine_init(MachineState *machine)
+{
+    RemoteMachineState *rms = REMOTE_MACHINE(machine);
+    MemoryRegion *sysmem = get_system_memory();
+    MemoryRegion *sysio = get_system_io();
+    MemoryRegion *pcimem = g_new(MemoryRegion, 1);
+    RemotePCIHost *host;
+
+    memory_region_init(pcimem, NULL, "pci", UINT64_MAX);
+
+    host = REMOTE_PCIHOST(qdev_new(TYPE_REMOTE_PCIHOST));
+    host->mr_sys_io = sysio;
+    host->mr_sys_mem = sysmem;
+    host->mr_pci_mem = pcimem;
+    rms->host = host;
+
+    object_property_add_child(OBJECT(rms), "remote-pcihost", OBJECT(host));
+    memory_region_add_subregion_overlap(sysmem, 0x0, pcimem, -1);
+
+    qdev_realize(DEVICE(host), sysbus_get_default(), &error_fatal);
+
+    remote_iohub_init(&rms->iohub);
+
+    pci_bus_irqs(PCI_HOST_BRIDGE(host)->bus, remote_iohub_set_irq,
+                 remote_iohub_map_irq, &rms->iohub, REMOTE_IOHUB_NB_PIRQS);
+}
+
+/* Class initializer: set the machine's description and init callback. */
+static void remote_machine_class_init(ObjectClass *oc, void *data)
+{
+    MachineClass *machine_class = MACHINE_CLASS(oc);
+
+    machine_class->desc = "Experimental remote machine";
+    machine_class->init = remote_machine_init;
+}
+
+/* QOM description of the remote machine type. */
+static const TypeInfo remote_machine = {
+    .parent        = TYPE_MACHINE,
+    .name          = TYPE_REMOTE_MACHINE,
+    .class_init    = remote_machine_class_init,
+    .instance_size = sizeof(RemoteMachineState),
+};
+
+/* Registers the type above; invoked through type_init(). */
+static void remote_machine_register_types(void)
+{
+    type_register_static(&remote_machine);
+}
+
+type_init(remote_machine_register_types);
diff --git a/hw/remote/memory.c b/hw/remote/memory.c
new file mode 100644
index 0000000..32085b1
--- /dev/null
+++ b/hw/remote/memory.c
@@ -0,0 +1,65 @@
+/*
+ * Memory manager for remote device
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/memory.h"
+#include "exec/address-spaces.h"
+#include "exec/ram_addr.h"
+#include "qapi/error.h"
+
+/*
+ * Remove every RAM subregion from the system memory region and unparent it.
+ * Subregions whose 'ram' flag is clear are left in place.
+ */
+static void remote_sysmem_reset(void)
+{
+    MemoryRegion *root = get_system_memory();
+    MemoryRegion *child, *next_child;
+
+    QTAILQ_FOREACH_SAFE(child, &root->subregions, subregions_link,
+                        next_child) {
+        if (!child->ram) {
+            continue;
+        }
+
+        memory_region_del_subregion(root, child);
+        object_unparent(OBJECT(child));
+    }
+}
+
+/*
+ * Rebuild the RAM layout of system memory from a SYNC_SYSMEM message.
+ *
+ * All current RAM subregions are dropped first. Then, for each of the
+ * msg->num_fds file descriptors, a RAM region named "remote-mem-<n>" is
+ * created from msg->fds[i] with the size and offset given in the message and
+ * mapped at the matching guest physical address.
+ *
+ * If creating a region fails, @errp is set, the regions mapped so far are
+ * removed again and the function returns.
+ */
+void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    SyncSysmemMsg *sysmem_info = &msg->data.sync_sysmem;
+    MemoryRegion *sysmem, *subregion;
+    static unsigned int suffix;
+    int region;
+
+    sysmem = get_system_memory();
+
+    remote_sysmem_reset();
+
+    for (region = 0; region < msg->num_fds; region++) {
+        /*
+         * Initialized at declaration so the g_autofree cleanup never runs
+         * on an indeterminate pointer.
+         */
+        g_autofree char *name = g_strdup_printf("remote-mem-%u", suffix++);
+
+        subregion = g_new(MemoryRegion, 1);
+        memory_region_init_ram_from_fd(subregion, NULL,
+                                       name, sysmem_info->sizes[region],
+                                       true, msg->fds[region],
+                                       sysmem_info->offsets[region],
+                                       errp);
+
+        if (*errp) {
+            g_free(subregion);
+            remote_sysmem_reset();
+            return;
+        }
+
+        memory_region_add_subregion(sysmem, sysmem_info->gpas[region],
+                                    subregion);
+    }
+}
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
new file mode 100644
index 0000000..e6a5574
--- /dev/null
+++ b/hw/remote/meson.build
@@ -0,0 +1,13 @@
+remote_ss = ss.source_set()
+
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
+
+specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
+specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c'))
+
+softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
diff --git a/hw/remote/message.c b/hw/remote/message.c
new file mode 100644
index 0000000..11d7298
--- /dev/null
+++ b/hw/remote/message.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright © 2020, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
+ *
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/machine.h"
+#include "io/channel.h"
+#include "hw/remote/mpqemu-link.h"
+#include "qapi/error.h"
+#include "sysemu/runstate.h"
+#include "hw/pci/pci.h"
+#include "exec/memattrs.h"
+#include "hw/remote/memory.h"
+#include "hw/remote/iohub.h"
+#include "sysemu/reset.h"
+
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
+ MPQemuMsg *msg, Error **errp);
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
+ MPQemuMsg *msg, Error **errp);
+static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
+static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
+static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
+ Error **errp);
+
+/*
+ * Message loop of the remote process.
+ *
+ * @data is a RemoteCommDev that this function takes ownership of (it is
+ * freed on return). Messages are received from com->ioc, validated and
+ * dispatched on their command until receiving fails or a handler reports
+ * an error.
+ *
+ * On exit a shutdown is requested: SHUTDOWN_CAUSE_HOST_ERROR (after
+ * reporting the error) if an error is pending, SHUTDOWN_CAUSE_GUEST_SHUTDOWN
+ * otherwise.
+ */
+void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
+{
+    g_autofree RemoteCommDev *com = (RemoteCommDev *)data;
+    PCIDevice *pci_dev = NULL;
+    Error *local_err = NULL;
+
+    assert(com->ioc);
+
+    pci_dev = com->dev;
+    while (!local_err) {
+        MPQemuMsg msg = {0};
+
+        if (!mpqemu_msg_recv(&msg, com->ioc, &local_err)) {
+            break;
+        }
+
+        if (!mpqemu_msg_valid(&msg)) {
+            /* Note the separating space between the two string pieces. */
+            error_setg(&local_err, "Received invalid message from proxy "
+                                   "in remote process pid="FMT_pid"",
+                       getpid());
+            break;
+        }
+
+        switch (msg.cmd) {
+        case MPQEMU_CMD_PCI_CFGWRITE:
+            process_config_write(com->ioc, pci_dev, &msg, &local_err);
+            break;
+        case MPQEMU_CMD_PCI_CFGREAD:
+            process_config_read(com->ioc, pci_dev, &msg, &local_err);
+            break;
+        case MPQEMU_CMD_BAR_WRITE:
+            process_bar_write(com->ioc, &msg, &local_err);
+            break;
+        case MPQEMU_CMD_BAR_READ:
+            process_bar_read(com->ioc, &msg, &local_err);
+            break;
+        case MPQEMU_CMD_SYNC_SYSMEM:
+            remote_sysmem_reconfig(&msg, &local_err);
+            break;
+        case MPQEMU_CMD_SET_IRQFD:
+            process_set_irqfd_msg(pci_dev, &msg);
+            break;
+        case MPQEMU_CMD_DEVICE_RESET:
+            process_device_reset_msg(com->ioc, pci_dev, &local_err);
+            break;
+        default:
+            error_setg(&local_err,
+                       "Unknown command (%d) received for device %s"
+                       " (pid="FMT_pid")",
+                       msg.cmd, DEVICE(pci_dev)->id, getpid());
+            break;
+        }
+    }
+
+    if (local_err) {
+        error_report_err(local_err);
+        qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR);
+    } else {
+        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+    }
+}
+
+/*
+ * Handle MPQEMU_CMD_PCI_CFGWRITE.
+ *
+ * Writes conf->val (conf->len bytes) to config space address conf->addr of
+ * @dev unless the access would run past pci_config_size(dev); in that case
+ * @errp is set and the reply value is UINT64_MAX. A MPQEMU_CMD_RET reply
+ * carrying a u64 is always sent over @ioc (0 on success).
+ *
+ * A failure to send the reply is also reported through @errp.
+ */
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
+                                 MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
+    MPQemuMsg ret = { 0 };
+
+    if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
+        error_setg(errp, "Bad address for PCI config write, pid "FMT_pid".",
+                   getpid());
+        ret.data.u64 = UINT64_MAX;
+    } else {
+        pci_default_write_config(dev, conf->addr, conf->val, conf->len);
+    }
+
+    ret.cmd = MPQEMU_CMD_RET;
+    ret.size = sizeof(ret.data.u64);
+
+    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+        /*
+         * The send error itself is discarded (NULL errp above), so *errp is
+         * only set if the address check failed. error_prepend() extends an
+         * existing error; without one, create the error instead.
+         */
+        if (*errp) {
+            error_prepend(errp,
+                          "Error returning code to proxy, pid "FMT_pid": ",
+                          getpid());
+        } else {
+            error_setg(errp, "Error returning code to proxy, pid "FMT_pid".",
+                       getpid());
+        }
+    }
+}
+
+/*
+ * Handle MPQEMU_CMD_PCI_CFGREAD.
+ *
+ * Reads conf->len bytes from config space address conf->addr of @dev and
+ * returns the value in a MPQEMU_CMD_RET reply over @ioc. If the access
+ * would run past pci_config_size(dev), @errp is set and the reply value is
+ * UINT64_MAX.
+ *
+ * A failure to send the reply is also reported through @errp.
+ */
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
+                                MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
+    MPQemuMsg ret = { 0 };
+
+    if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
+        error_setg(errp, "Bad address for PCI config read, pid "FMT_pid".",
+                   getpid());
+        ret.data.u64 = UINT64_MAX;
+    } else {
+        ret.data.u64 = pci_default_read_config(dev, conf->addr, conf->len);
+    }
+
+    ret.cmd = MPQEMU_CMD_RET;
+    ret.size = sizeof(ret.data.u64);
+
+    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+        /*
+         * The send error itself is discarded (NULL errp above), so *errp is
+         * only set if the address check failed. error_prepend() extends an
+         * existing error; without one, create the error instead.
+         */
+        if (*errp) {
+            error_prepend(errp,
+                          "Error returning code to proxy, pid "FMT_pid": ",
+                          getpid());
+        } else {
+            error_setg(errp, "Error returning code to proxy, pid "FMT_pid".",
+                       getpid());
+        }
+    }
+}
+
+/*
+ * Handle MPQEMU_CMD_BAR_WRITE.
+ *
+ * Writes bar_access->val (converted with cpu_to_le64) with the requested
+ * size to bar_access->addr in the memory or I/O address space, depending on
+ * bar_access->memory. The size must be a power of two no larger than 8
+ * bytes; otherwise nothing is written and the reply value is UINT64_MAX
+ * (no error is set for that case). A failed write sets @errp and also
+ * replies UINT64_MAX. A MPQEMU_CMD_RET reply is always sent over @ioc.
+ *
+ * A failure to send the reply is also reported through @errp.
+ */
+static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    BarAccessMsg *bar_access = &msg->data.bar_access;
+    AddressSpace *as =
+        bar_access->memory ? &address_space_memory : &address_space_io;
+    MPQemuMsg ret = { 0 };
+    MemTxResult res;
+    uint64_t val;
+
+    if (!is_power_of_2(bar_access->size) ||
+       (bar_access->size > sizeof(uint64_t))) {
+        ret.data.u64 = UINT64_MAX;
+        goto fail;
+    }
+
+    val = cpu_to_le64(bar_access->val);
+
+    res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
+                           (void *)&val, bar_access->size, true);
+
+    if (res != MEMTX_OK) {
+        error_setg(errp, "Bad address %"PRIx64" for mem write, pid "FMT_pid".",
+                   bar_access->addr, getpid());
+        ret.data.u64 = UINT64_MAX;
+    }
+
+fail:
+    ret.cmd = MPQEMU_CMD_RET;
+    ret.size = sizeof(ret.data.u64);
+
+    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+        /*
+         * The send error itself is discarded (NULL errp above), so *errp
+         * may still be unset here. error_prepend() extends an existing
+         * error; without one, create the error instead.
+         */
+        if (*errp) {
+            error_prepend(errp,
+                          "Error returning code to proxy, pid "FMT_pid": ",
+                          getpid());
+        } else {
+            error_setg(errp, "Error returning code to proxy, pid "FMT_pid".",
+                       getpid());
+        }
+    }
+}
+
+/*
+ * Handle MPQEMU_CMD_BAR_READ.
+ *
+ * Reads bar_access->size bytes from bar_access->addr in the memory or I/O
+ * address space, depending on bar_access->memory, and returns the value
+ * (converted with le64_to_cpu) in a MPQEMU_CMD_RET reply over @ioc. The
+ * size must be a power of two no larger than 8 bytes; otherwise nothing is
+ * read and the reply value is UINT64_MAX (no error is set for that case).
+ * A failed read sets @errp and also replies UINT64_MAX.
+ *
+ * A failure to send the reply is also reported through @errp.
+ */
+static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    BarAccessMsg *bar_access = &msg->data.bar_access;
+    MPQemuMsg ret = { 0 };
+    AddressSpace *as;
+    MemTxResult res;
+    uint64_t val = 0;
+
+    as = bar_access->memory ? &address_space_memory : &address_space_io;
+
+    if (!is_power_of_2(bar_access->size) ||
+       (bar_access->size > sizeof(uint64_t))) {
+        val = UINT64_MAX;
+        goto fail;
+    }
+
+    res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
+                           (void *)&val, bar_access->size, false);
+
+    if (res != MEMTX_OK) {
+        error_setg(errp, "Bad address %"PRIx64" for mem read, pid "FMT_pid".",
+                   bar_access->addr, getpid());
+        val = UINT64_MAX;
+    }
+
+fail:
+    ret.cmd = MPQEMU_CMD_RET;
+    ret.data.u64 = le64_to_cpu(val);
+    ret.size = sizeof(ret.data.u64);
+
+    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+        /*
+         * The send error itself is discarded (NULL errp above), so *errp
+         * may still be unset here. error_prepend() extends an existing
+         * error; without one, create the error instead.
+         */
+        if (*errp) {
+            error_prepend(errp,
+                          "Error returning code to proxy, pid "FMT_pid": ",
+                          getpid());
+        } else {
+            error_setg(errp, "Error returning code to proxy, pid "FMT_pid".",
+                       getpid());
+        }
+    }
+}
+
+/*
+ * Handle MPQEMU_CMD_DEVICE_RESET: call the device class' reset hook for
+ * @dev if it has one, then send an empty MPQEMU_CMD_RET reply over @ioc.
+ * A send failure is reported through @errp.
+ */
+static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
+                                     Error **errp)
+{
+    DeviceClass *klass = DEVICE_GET_CLASS(dev);
+    DeviceState *state = DEVICE(dev);
+    MPQemuMsg reply = { 0 };
+
+    reply.cmd = MPQEMU_CMD_RET;
+
+    if (klass->reset != NULL) {
+        klass->reset(state);
+    }
+
+    mpqemu_msg_send(&reply, ioc, errp);
+}
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
new file mode 100644
index 0000000..9ce3152
--- /dev/null
+++ b/hw/remote/mpqemu-link.c
@@ -0,0 +1,267 @@
+/*
+ * Communication channel between QEMU and remote device process
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qemu/module.h"
+#include "hw/remote/mpqemu-link.h"
+#include "qapi/error.h"
+#include "qemu/iov.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "io/channel.h"
+#include "sysemu/iothread.h"
+#include "trace.h"
+
+/*
+ * Send @msg (header, payload and any file descriptors) over @ioc.
+ *
+ * Safe to call from:
+ *   - the main loop, in co-routine context (outside of co-routine context
+ *     the main loop is blocked);
+ *   - a vCPU thread without co-routine context, provided the channel is
+ *     not handled by the main loop;
+ *   - an IOThread, in co-routine context only (asserted below, since the
+ *     IOThread would otherwise be blocked).
+ *
+ * Returns true if the message was written without error, false otherwise.
+ */
+bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
+{
+    ERRP_GUARD();
+    bool holds_lock = qemu_mutex_iothread_locked();
+    bool in_iothread = qemu_in_iothread();
+    struct iovec vec[2] = {
+        { .iov_base = msg, .iov_len = MPQEMU_MSG_HDR_SIZE },
+        { .iov_base = (void *)&msg->data, .iov_len = msg->size },
+    };
+    int *fd_list = NULL;
+    size_t fd_count = 0;
+    bool sent = false;
+
+    if (msg->num_fds) {
+        fd_count = msg->num_fds;
+        fd_list = msg->fds;
+    }
+
+    /* Never block an IOThread: it must be running a co-routine here. */
+    assert(qemu_in_coroutine() || !in_iothread);
+
+    /*
+     * The iothread lock is released around the write only when it is held
+     * by a caller that is neither an IOThread nor inside a co-routine.
+     */
+    if (holds_lock && !in_iothread && !qemu_in_coroutine()) {
+        qemu_mutex_unlock_iothread();
+    }
+
+    if (qio_channel_writev_full_all(ioc, vec, G_N_ELEMENTS(vec),
+                                    fd_list, fd_count, errp) == 0) {
+        sent = true;
+    } else {
+        trace_mpqemu_send_io_error(msg->cmd, msg->size, fd_count);
+    }
+
+    /* Re-take the lock under the same condition it was dropped. */
+    if (holds_lock && !in_iothread && !qemu_in_coroutine()) {
+        qemu_mutex_lock_iothread();
+    }
+
+    return sent;
+}
+
+/*
+ * Read exactly @len bytes from @ioc into @buf, optionally collecting file
+ * descriptors in @fds / @nfds.
+ *
+ * Safe to call from:
+ *   - the main loop, in co-routine context (outside of co-routine context
+ *     the main loop is blocked);
+ *   - a vCPU thread without co-routine context, provided the channel is
+ *     not handled by the main loop;
+ *   - an IOThread, in co-routine context only (asserted below, since the
+ *     IOThread would otherwise be blocked).
+ *
+ * Returns @len when the read succeeded; otherwise the (<= 0) result of
+ * qio_channel_readv_full_all_eof() is passed through.
+ */
+static ssize_t mpqemu_read(QIOChannel *ioc, void *buf, size_t len, int **fds,
+                           size_t *nfds, Error **errp)
+{
+    ERRP_GUARD();
+    struct iovec vec = { .iov_base = buf, .iov_len = len };
+    bool holds_lock = qemu_mutex_iothread_locked();
+    bool in_iothread = qemu_in_iothread();
+    int rc = -1;
+
+    /* Never block an IOThread: it must be running a co-routine here. */
+    assert(qemu_in_coroutine() || !in_iothread);
+
+    if (holds_lock && !in_iothread && !qemu_in_coroutine()) {
+        qemu_mutex_unlock_iothread();
+    }
+
+    rc = qio_channel_readv_full_all_eof(ioc, &vec, 1, fds, nfds, errp);
+
+    if (holds_lock && !in_iothread && !qemu_in_coroutine()) {
+        qemu_mutex_lock_iothread();
+    }
+
+    if (rc <= 0) {
+        return rc;
+    }
+
+    return vec.iov_len;
+}
+
+/*
+ * Receive one message from @ioc into @msg.
+ *
+ * The header is read first, together with any file descriptors; the payload
+ * follows if msg->size is non-zero. The payload size is checked against
+ * sizeof(msg->data) and the number of descriptors against the capacity of
+ * msg->fds before they are copied into @msg.
+ *
+ * Returns true on success. Returns false on error or when the header read
+ * returned <= 0; if @errp was set, the received descriptors are closed.
+ */
+bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
+{
+    ERRP_GUARD();
+    g_autofree int *rx_fds = NULL;
+    size_t rx_nfds = 0;
+    ssize_t got;
+    bool ok = false;
+
+    got = mpqemu_read(ioc, msg, MPQEMU_MSG_HDR_SIZE, &rx_fds, &rx_nfds, errp);
+    if (got <= 0) {
+        goto out;
+    }
+    if (got != MPQEMU_MSG_HDR_SIZE) {
+        error_setg(errp, "Message header corrupted");
+        goto out;
+    }
+
+    if (msg->size > sizeof(msg->data)) {
+        error_setg(errp, "Invalid size for message");
+        goto out;
+    }
+
+    /* Messages without a payload go straight to the descriptor copy. */
+    if (msg->size) {
+        got = mpqemu_read(ioc, &msg->data, msg->size, NULL, NULL, errp);
+        if (got <= 0) {
+            goto out;
+        }
+        if (got != msg->size) {
+            error_setg(errp, "Unable to read full message");
+            goto out;
+        }
+    }
+
+    msg->num_fds = rx_nfds;
+    if (rx_nfds > G_N_ELEMENTS(msg->fds)) {
+        error_setg(errp,
+                   "Overflow error: received %zu fds, more than max of %d fds",
+                   rx_nfds, REMOTE_MAX_FDS);
+        goto out;
+    }
+    if (rx_nfds) {
+        memcpy(msg->fds, rx_fds, rx_nfds * sizeof(int));
+    }
+
+    ok = true;
+
+out:
+    if (*errp) {
+        trace_mpqemu_recv_io_error(msg->cmd, msg->size, rx_nfds);
+
+        /* Descriptors received with a failed message are not handed out. */
+        while (rx_nfds) {
+            close(rx_fds[rx_nfds - 1]);
+            rx_nfds--;
+        }
+    }
+
+    return ok;
+}
+
+/*
+ * Send @msg to the remote process of @pdev and wait for its reply, holding
+ * pdev->io_mutex for the whole exchange.
+ *
+ * The reply must be a valid message with command MPQEMU_CMD_RET. Returns
+ * the u64 carried by the reply, or UINT64_MAX on error (with @errp set by
+ * the failing step).
+ *
+ * Must not be called from co-routine context (asserted). Used by the Proxy
+ * object to communicate with remote processes.
+ */
+uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
+                                         Error **errp)
+{
+    ERRP_GUARD();
+    MPQemuMsg reply = {0};
+
+    assert(!qemu_in_coroutine());
+
+    QEMU_LOCK_GUARD(&pdev->io_mutex);
+
+    if (!mpqemu_msg_send(msg, pdev->ioc, errp) ||
+        !mpqemu_msg_recv(&reply, pdev->ioc, errp)) {
+        return UINT64_MAX;
+    }
+
+    if (!mpqemu_msg_valid(&reply) || reply.cmd != MPQEMU_CMD_RET) {
+        error_setg(errp, "ERROR: Invalid reply received for command %d",
+                         msg->cmd);
+        return UINT64_MAX;
+    }
+
+    return reply.data.u64;
+}
+
+/*
+ * Sanity-check a received message.
+ *
+ * Returns false if the command is outside [0, MPQEMU_CMD_MAX), if num_fds
+ * is REMOTE_MAX_FDS or more, if any of the descriptors fails
+ * fcntl(F_GETFL), or if size / num_fds do not match what the command
+ * requires:
+ *   - SYNC_SYSMEM: at least one fd, size == sizeof(SyncSysmemMsg)
+ *   - PCI_CFGWRITE / PCI_CFGREAD: size == sizeof(PciConfDataMsg)
+ *   - BAR_WRITE / BAR_READ: size == sizeof(BarAccessMsg), no fds
+ *   - SET_IRQFD: no payload, exactly two fds
+ * Other commands have no additional constraints. Returns true otherwise.
+ */
+bool mpqemu_msg_valid(MPQemuMsg *msg)
+{
+    /* Either bound being violated makes the command invalid. */
+    if (msg->cmd >= MPQEMU_CMD_MAX || msg->cmd < 0) {
+        return false;
+    }
+
+    /* Verify FDs. */
+    if (msg->num_fds >= REMOTE_MAX_FDS) {
+        return false;
+    }
+
+    if (msg->num_fds > 0) {
+        for (int i = 0; i < msg->num_fds; i++) {
+            if (fcntl(msg->fds[i], F_GETFL) == -1) {
+                return false;
+            }
+        }
+    }
+
+    /* Verify message specific fields. */
+    switch (msg->cmd) {
+    case MPQEMU_CMD_SYNC_SYSMEM:
+        if (msg->num_fds == 0 || msg->size != sizeof(SyncSysmemMsg)) {
+            return false;
+        }
+        break;
+    case MPQEMU_CMD_PCI_CFGWRITE:
+    case MPQEMU_CMD_PCI_CFGREAD:
+        if (msg->size != sizeof(PciConfDataMsg)) {
+            return false;
+        }
+        break;
+    case MPQEMU_CMD_BAR_WRITE:
+    case MPQEMU_CMD_BAR_READ:
+        if ((msg->size != sizeof(BarAccessMsg)) || (msg->num_fds != 0)) {
+            return false;
+        }
+        break;
+    case MPQEMU_CMD_SET_IRQFD:
+        if (msg->size || (msg->num_fds != 2)) {
+            return false;
+        }
+        break;
+    default:
+        break;
+    }
+
+    return true;
+}
diff --git a/hw/remote/proxy-memory-listener.c b/hw/remote/proxy-memory-listener.c
new file mode 100644
index 0000000..af1fa6f
--- /dev/null
+++ b/hw/remote/proxy-memory-listener.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qemu/compiler.h"
+#include "qemu/int128.h"
+#include "qemu/range.h"
+#include "exec/memory.h"
+#include "exec/cpu-common.h"
+#include "cpu.h"
+#include "exec/ram_addr.h"
+#include "exec/address-spaces.h"
+#include "qapi/error.h"
+#include "hw/remote/mpqemu-link.h"
+#include "hw/remote/proxy-memory-listener.h"
+
+/*
+ * TODO: get_fd_from_hostaddr(), proxy_mrs_can_merge() and
+ * proxy_memory_listener_commit() defined below perform tasks similar to the
+ * functions defined in vhost-user.c. These functions are good candidates
+ * for refactoring.
+ *
+ */
+
+/*
+ * Forget all recorded sections: drop the reference held on each section's
+ * memory region, free the array and reset the count. Installed as the
+ * listener's 'begin' callback and also used when deconfiguring.
+ */
+static void proxy_memory_listener_reset(MemoryListener *listener)
+{
+    ProxyMemoryListener *pl = container_of(listener, ProxyMemoryListener,
+                                           listener);
+
+    for (int i = 0; i < pl->n_mr_sections; i++) {
+        memory_region_unref(pl->mr_sections[i].mr);
+    }
+
+    g_free(pl->mr_sections);
+    pl->n_mr_sections = 0;
+    pl->mr_sections = NULL;
+}
+
+/*
+ * Return the fd of the memory region containing host address @host; if
+ * @offset is non-NULL it receives the offset reported by
+ * memory_region_from_host().
+ */
+static int get_fd_from_hostaddr(uint64_t host, ram_addr_t *offset)
+{
+    ram_addr_t region_off;
+    MemoryRegion *region;
+
+    /*
+     * The host address comes from the MemoryListener system and is assumed
+     * to be valid. For an invalid address the lookup below would yield the
+     * default subregion of "system_memory" rather than NULL, so a NULL
+     * check is not possible here.
+     */
+    region = memory_region_from_host((void *)(uintptr_t)host, &region_off);
+
+    if (offset != NULL) {
+        *offset = region_off;
+    }
+
+    return memory_region_get_fd(region);
+}
+
+/*
+ * Two host ranges can be merged when @host starts exactly @size bytes after
+ * @prev_host and both addresses are backed by the same fd.
+ */
+static bool proxy_mrs_can_merge(uint64_t host, uint64_t prev_host, size_t size)
+{
+    bool contiguous = (prev_host + size) == host;
+
+    return contiguous &&
+           get_fd_from_hostaddr(host, NULL) ==
+           get_fd_from_hostaddr(prev_host, NULL);
+}
+
+/*
+ * Try to fold @section into the most recently recorded section of
+ * @proxy_listener instead of recording it separately.
+ *
+ * Returns true when the caller must not append @section: either it was
+ * merged into the previous entry, or its host address has no backing fd
+ * (get_fd_from_hostaddr() < 0). Returns false when nothing has been
+ * recorded yet or the section cannot be merged.
+ */
+static bool try_merge(ProxyMemoryListener *proxy_listener,
+                      MemoryRegionSection *section)
+{
+    uint64_t sec_size, sec_gpa, page_size;
+    uint64_t last_gpa, last_size, last_gpa_end, last_host, last_host_end;
+    uint64_t merged_host, merged_end;
+    MemoryRegionSection *last;
+    uintptr_t sec_host;
+
+    if (proxy_listener->n_mr_sections == 0) {
+        return false;
+    }
+
+    page_size = (uint64_t)qemu_ram_pagesize(section->mr->ram_block);
+    sec_size = int128_get64(section->size);
+    sec_gpa = section->offset_within_address_space;
+    sec_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
+               section->offset_within_region;
+
+    if (get_fd_from_hostaddr(sec_host, NULL) < 0) {
+        return true;
+    }
+
+    /* Widen the new section to page boundaries. */
+    sec_host &= ~(page_size - 1);
+    sec_gpa &= ~(page_size - 1);
+    sec_size = ROUND_UP(sec_size, page_size);
+
+    last = &proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1];
+    last_gpa = last->offset_within_address_space;
+    last_size = int128_get64(last->size);
+    last_gpa_end = range_get_last(last_gpa, last_size);
+    last_host = (uintptr_t)memory_region_get_ram_ptr(last->mr) +
+                last->offset_within_region;
+    last_host_end = range_get_last(last_host, last_size);
+
+    /* Only a section that touches or overlaps the previous one can merge. */
+    if (sec_gpa > (last_gpa_end + 1)) {
+        return false;
+    }
+
+    g_assert(sec_gpa > last_gpa);
+
+    if ((section->mr != last->mr) ||
+        !proxy_mrs_can_merge(sec_host, last_host, (sec_gpa - last_gpa))) {
+        return false;
+    }
+
+    /* Grow the previous entry so that it covers both ranges. */
+    merged_end = MAX(last_host_end, sec_host + sec_size);
+    merged_host = MIN(last_host, sec_host);
+
+    last->offset_within_address_space = MIN(last_gpa, sec_gpa);
+    last->offset_within_region =
+        merged_host - (uintptr_t)memory_region_get_ram_ptr(last->mr);
+    last->size = int128_make64(merged_end - merged_host);
+
+    return true;
+}
+
+/*
+ * region_add / region_nop callback: record @section if it is RAM (and not
+ * ROM) and could not be merged into the previously recorded section. A
+ * recorded section holds a reference on its memory region and has its 'fv'
+ * pointer cleared.
+ */
+static void proxy_memory_listener_region_addnop(MemoryListener *listener,
+                                                MemoryRegionSection *section)
+{
+    ProxyMemoryListener *pl = container_of(listener, ProxyMemoryListener,
+                                           listener);
+    MemoryRegionSection *slot;
+
+    if (!memory_region_is_ram(section->mr) ||
+        memory_region_is_rom(section->mr)) {
+        return;
+    }
+
+    if (try_merge(pl, section)) {
+        return;
+    }
+
+    /* Grow the array by one entry and fill in the new last slot. */
+    pl->n_mr_sections++;
+    pl->mr_sections = g_renew(MemoryRegionSection, pl->mr_sections,
+                              pl->n_mr_sections);
+
+    slot = &pl->mr_sections[pl->n_mr_sections - 1];
+    *slot = *section;
+    slot->fv = NULL;
+
+    memory_region_ref(section->mr);
+}
+
+/*
+ * commit callback: send the recorded sections to the remote process as one
+ * MPQEMU_CMD_SYNC_SYSMEM message carrying, per section, the guest physical
+ * address, the size, the backing fd and the offset within it.
+ *
+ * Nothing is sent (an error is reported) when there are more sections than
+ * REMOTE_MAX_FDS. A send failure is reported as well.
+ */
+static void proxy_memory_listener_commit(MemoryListener *listener)
+{
+    ProxyMemoryListener *pl = container_of(listener, ProxyMemoryListener,
+                                           listener);
+    Error *local_err = NULL;
+    SyncSysmemMsg *sync;
+    MPQemuMsg msg;
+
+    memset(&msg, 0, sizeof(MPQemuMsg));
+
+    msg.cmd = MPQEMU_CMD_SYNC_SYSMEM;
+    msg.size = sizeof(SyncSysmemMsg);
+    msg.num_fds = pl->n_mr_sections;
+
+    if (msg.num_fds > REMOTE_MAX_FDS) {
+        error_report("Number of fds is more than %d", REMOTE_MAX_FDS);
+        return;
+    }
+
+    sync = &msg.data.sync_sysmem;
+
+    for (int i = 0; i < pl->n_mr_sections; i++) {
+        MemoryRegionSection *sec = &pl->mr_sections[i];
+        uintptr_t host = (uintptr_t)memory_region_get_ram_ptr(sec->mr) +
+                         sec->offset_within_region;
+        ram_addr_t fd_offset;
+
+        sync->gpas[i] = sec->offset_within_address_space;
+        sync->sizes[i] = int128_get64(sec->size);
+        msg.fds[i] = get_fd_from_hostaddr(host, &fd_offset);
+        sync->offsets[i] = fd_offset;
+    }
+
+    if (!mpqemu_msg_send(&msg, pl->ioc, &local_err)) {
+        error_report_err(local_err);
+    }
+}
+
+/*
+ * Undo proxy_memory_listener_configure(): unregister the listener, then
+ * release everything it had recorded.
+ */
+void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener)
+{
+    MemoryListener *ml = &proxy_listener->listener;
+
+    memory_listener_unregister(ml);
+    proxy_memory_listener_reset(ml);
+}
+
+/*
+ * Set up @proxy_listener to mirror the layout of address_space_memory to
+ * the remote process reachable through @ioc: start with an empty section
+ * list, install the callbacks (priority 10) and register the listener.
+ */
+void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
+                                     QIOChannel *ioc)
+{
+    MemoryListener *ml = &proxy_listener->listener;
+
+    proxy_listener->ioc = ioc;
+    proxy_listener->mr_sections = NULL;
+    proxy_listener->n_mr_sections = 0;
+
+    ml->priority = 10;
+    ml->begin = proxy_memory_listener_reset;
+    ml->region_add = proxy_memory_listener_region_addnop;
+    ml->region_nop = proxy_memory_listener_region_addnop;
+    ml->commit = proxy_memory_listener_commit;
+
+    memory_listener_register(ml, &address_space_memory);
+}
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
new file mode 100644
index 0000000..4fa4be0
--- /dev/null
+++ b/hw/remote/proxy.c
@@ -0,0 +1,379 @@
+/*
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/proxy.h"
+#include "hw/pci/pci.h"
+#include "qapi/error.h"
+#include "io/channel-util.h"
+#include "hw/qdev-properties.h"
+#include "monitor/monitor.h"
+#include "migration/blocker.h"
+#include "qemu/sockets.h"
+#include "hw/remote/mpqemu-link.h"
+#include "qemu/error-report.h"
+#include "hw/remote/proxy-memory-listener.h"
+#include "qom/object.h"
+#include "qemu/event_notifier.h"
+#include "sysemu/kvm.h"
+#include "util/event_notifier-posix.c"
+
+static void probe_pci_info(PCIDevice *dev, Error **errp);
+static void proxy_device_reset(DeviceState *dev);
+
+/*
+ * Re-bind the proxy device's irqfd to the GSI its INTx pin is currently
+ * routed to.
+ *
+ * Installed as the INTx routing notifier by setup_irqfd(), which also calls
+ * it once directly for the initial binding. dev->virq holds the GSI that is
+ * currently bound, or -1 when nothing is bound.
+ */
+static void proxy_intx_update(PCIDevice *pci_dev)
+{
+ PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev);
+ PCIINTxRoute route;
+ /* Interrupt-pin register value minus one is used as the pin to route. */
+ int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+
+ /* Detach the irqfd from the previously bound GSI, if any. */
+ if (dev->virq != -1) {
+ kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &dev->intr, dev->virq);
+ dev->virq = -1;
+ }
+
+ route = pci_device_route_intx_to_irq(pci_dev, pin);
+
+ dev->virq = route.irq;
+
+ /* Attach intr (trigger) and resample notifiers to the new GSI. */
+ if (dev->virq != -1) {
+ kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &dev->intr,
+ &dev->resample, dev->virq);
+ }
+}
+
+/*
+ * Create the interrupt and resample event notifiers for @dev, hand their
+ * file descriptors to the remote process with MPQEMU_CMD_SET_IRQFD, and
+ * hook the notifiers up to the current INTx route.
+ *
+ * A failure to send the message is only reported; setup continues.
+ */
+static void setup_irqfd(PCIProxyDev *dev)
+{
+ PCIDevice *pci_dev = PCI_DEVICE(dev);
+ MPQemuMsg msg;
+ Error *local_err = NULL;
+
+ /* NOTE(review): return values of event_notifier_init() are not checked. */
+ event_notifier_init(&dev->intr, 0);
+ event_notifier_init(&dev->resample, 0);
+
+ /* The message carries no payload, only the two notifier fds. */
+ memset(&msg, 0, sizeof(MPQemuMsg));
+ msg.cmd = MPQEMU_CMD_SET_IRQFD;
+ msg.num_fds = 2;
+ msg.fds[0] = event_notifier_get_fd(&dev->intr);
+ msg.fds[1] = event_notifier_get_fd(&dev->resample);
+ msg.size = 0;
+
+ if (!mpqemu_msg_send(&msg, dev->ioc, &local_err)) {
+ error_report_err(local_err);
+ }
+
+ /* -1 tells proxy_intx_update() that no GSI is bound yet. */
+ dev->virq = -1;
+
+ proxy_intx_update(pci_dev);
+
+ /* Re-run the binding whenever the INTx routing changes. */
+ pci_device_set_intx_routing_notifier(pci_dev, proxy_intx_update);
+}
+
+/*
+ * Realize handler of the proxy PCI device.
+ *
+ * Resolves the "fd" property to a socket file descriptor, wraps it in a
+ * QIOChannel used to talk to the remote process, registers a migration
+ * blocker, then sets up the memory listener, the irqfds and finally probes
+ * the remote device's PCI identity and BARs.
+ *
+ * On any failure @errp is set and every resource acquired so far (the fd,
+ * the channel, the migration blocker) is released before returning.
+ */
+static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
+{
+    ERRP_GUARD();
+    PCIProxyDev *dev = PCI_PROXY_DEV(device);
+    uint8_t *pci_conf = device->config;
+    int fd;
+
+    if (!dev->fd) {
+        error_setg(errp, "fd parameter not specified for %s",
+                   DEVICE(device)->id);
+        return;
+    }
+
+    fd = monitor_fd_param(monitor_cur(), dev->fd, errp);
+    if (fd == -1) {
+        error_prepend(errp, "proxy: unable to parse fd %s: ", dev->fd);
+        return;
+    }
+
+    if (!fd_is_socket(fd)) {
+        error_setg(errp, "proxy: fd %d is not a socket", fd);
+        close(fd);
+        return;
+    }
+
+    dev->ioc = qio_channel_new_fd(fd, errp);
+    if (!dev->ioc) {
+        /* No channel took ownership of the descriptor; close it here. */
+        close(fd);
+        return;
+    }
+
+    error_setg(&dev->migration_blocker, "%s does not support migration",
+               TYPE_PCI_PROXY_DEV);
+    if (migrate_add_blocker(dev->migration_blocker, errp) < 0) {
+        /* Undo everything acquired so far; the device is not realized. */
+        error_free(dev->migration_blocker);
+        dev->migration_blocker = NULL;
+        object_unref(OBJECT(dev->ioc));
+        dev->ioc = NULL;
+        return;
+    }
+
+    qemu_mutex_init(&dev->io_mutex);
+    qio_channel_set_blocking(dev->ioc, true, NULL);
+
+    pci_conf[PCI_LATENCY_TIMER] = 0xff;
+    pci_conf[PCI_INTERRUPT_PIN] = 0x01;
+
+    proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc);
+
+    setup_irqfd(dev);
+
+    probe_pci_info(PCI_DEVICE(dev), errp);
+}
+
+/*
+ * Exit handler of the proxy PCI device: closes the channel to the remote
+ * process, removes and frees the migration blocker, tears down the memory
+ * listener and cleans up both irqfd event notifiers.
+ */
+static void pci_proxy_dev_exit(PCIDevice *pdev)
+{
+ PCIProxyDev *dev = PCI_PROXY_DEV(pdev);
+
+ /* NOTE(review): the channel is closed but not unref'ed here - confirm. */
+ if (dev->ioc) {
+ qio_channel_close(dev->ioc, NULL);
+ }
+
+ migrate_del_blocker(dev->migration_blocker);
+
+ error_free(dev->migration_blocker);
+
+ proxy_memory_listener_deconfigure(&dev->proxy_listener);
+
+ event_notifier_cleanup(&dev->intr);
+ event_notifier_cleanup(&dev->resample);
+}
+
+/*
+ * Forward one PCI config space access to the remote process and wait for
+ * its reply.
+ *
+ * @pdev: proxy device whose remote end is addressed
+ * @addr: config space offset
+ * @val:  value to write for MPQEMU_CMD_PCI_CFGWRITE; receives the low 32
+ *        bits of the reply for MPQEMU_CMD_PCI_CFGREAD
+ * @len:  access width in bytes
+ * @op:   MPQEMU_CMD_PCI_CFGREAD or MPQEMU_CMD_PCI_CFGWRITE
+ *
+ * Errors are reported, not propagated. A reply of UINT64_MAX is reported
+ * as a failed operation.
+ */
+static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
+                           int len, unsigned int op)
+{
+    const bool is_read = (op == MPQEMU_CMD_PCI_CFGREAD);
+    MPQemuMsg msg = { 0 };
+    Error *err = NULL;
+    uint64_t reply;
+
+    msg.cmd = op;
+    msg.size = sizeof(PciConfDataMsg);
+    msg.data.pci_conf_data.addr = addr;
+    msg.data.pci_conf_data.len = len;
+    /* Only writes carry a value; it stays zero for every other command. */
+    if (op == MPQEMU_CMD_PCI_CFGWRITE) {
+        msg.data.pci_conf_data.val = *val;
+    }
+
+    reply = mpqemu_msg_send_and_await_reply(&msg, pdev, &err);
+    if (err) {
+        error_report_err(err);
+    }
+
+    if (reply == UINT64_MAX) {
+        error_report("Failed to perform PCI config %s operation",
+                     is_read ? "READ" : "WRITE");
+    }
+
+    if (is_read) {
+        *val = (uint32_t)reply;
+    }
+}
+
+/*
+ * PCIDeviceClass::config_read hook: fetch @len bytes at config offset @addr
+ * from the remote device and return them.
+ */
+static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len)
+{
+    PCIProxyDev *proxy = PCI_PROXY_DEV(d);
+    uint32_t result;
+
+    /* config_op_send() always stores the reply into result for reads. */
+    config_op_send(proxy, addr, &result, len, MPQEMU_CMD_PCI_CFGREAD);
+
+    return result;
+}
+
+/*
+ * PCIDeviceClass::config_write hook: apply the write to the proxy's local
+ * config space copy and then forward it to the remote device.
+ */
+static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val,
+                                   int len)
+{
+    PCIProxyDev *proxy = PCI_PROXY_DEV(d);
+
+    /*
+     * The proxy keeps a cached copy of the remote device's config space
+     * that some functions read directly, so update the cache first.
+     */
+    pci_default_write_config(d, addr, val, len);
+
+    config_op_send(proxy, addr, &val, len, MPQEMU_CMD_PCI_CFGWRITE);
+}
+
+/*
+ * qdev properties of the proxy device. "fd" is a string that realize
+ * resolves to a socket file descriptor via monitor_fd_param().
+ */
+static Property proxy_properties[] = {
+ DEFINE_PROP_STRING("fd", PCIProxyDev, fd),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+/*
+ * Class initializer: installs the proxy's PCI callbacks (realize, exit,
+ * config space read/write), its device reset handler and its properties.
+ */
+static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
+{
+    PCIDeviceClass *pci_class = PCI_DEVICE_CLASS(klass);
+    DeviceClass *dev_class = DEVICE_CLASS(klass);
+
+    pci_class->config_write = pci_proxy_write_config;
+    pci_class->config_read = pci_proxy_read_config;
+    pci_class->exit = pci_proxy_dev_exit;
+    pci_class->realize = pci_proxy_dev_realize;
+
+    dev_class->reset = proxy_device_reset;
+    device_class_set_props(dev_class, proxy_properties);
+}
+
+/* QOM type description of the proxy device, a conventional PCI device. */
+static const TypeInfo pci_proxy_dev_type_info = {
+    .parent        = TYPE_PCI_DEVICE,
+    .name          = TYPE_PCI_PROXY_DEV,
+    .class_init    = pci_proxy_dev_class_init,
+    .instance_size = sizeof(PCIProxyDev),
+    .interfaces    = (InterfaceInfo[]) {
+        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+        { },
+    },
+};
+
+/* Register the proxy device type with QOM; hooked in through type_init(). */
+static void pci_proxy_dev_register_types(void)
+{
+ type_register_static(&pci_proxy_dev_type_info);
+}
+
+type_init(pci_proxy_dev_register_types)
+
+/*
+ * Forward one BAR access to the remote process and wait for its reply.
+ *
+ * @pdev:   proxy device whose remote end is addressed
+ * @mr:     memory region of the BAR; its base address is added to @addr
+ * @write:  true for a write (MPQEMU_CMD_BAR_WRITE), false for a read
+ * @addr:   offset of the access within @mr
+ * @val:    value to write, or location receiving the value read
+ * @size:   access width in bytes
+ * @memory: true for a memory BAR, false for an I/O BAR
+ *
+ * Errors are reported, not propagated.
+ */
+static void send_bar_access_msg(PCIProxyDev *pdev, MemoryRegion *mr,
+                                bool write, hwaddr addr, uint64_t *val,
+                                unsigned size, bool memory)
+{
+    MPQemuMsg msg = { 0 };
+    /*
+     * Keep the reply in a 64-bit unsigned type, as config_op_send() does:
+     * a 'long' would truncate and sign-extend 8-byte read data on hosts
+     * where long is 32 bits wide.
+     */
+    uint64_t ret;
+    Error *local_err = NULL;
+
+    msg.size = sizeof(BarAccessMsg);
+    msg.data.bar_access.addr = mr->addr + addr;
+    msg.data.bar_access.size = size;
+    msg.data.bar_access.memory = memory;
+
+    if (write) {
+        msg.cmd = MPQEMU_CMD_BAR_WRITE;
+        msg.data.bar_access.val = *val;
+    } else {
+        msg.cmd = MPQEMU_CMD_BAR_READ;
+    }
+
+    ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+    }
+
+    if (!write) {
+        *val = ret;
+    }
+}
+
+/*
+ * MemoryRegionOps write hook: @opaque is the ProxyMemoryRegion of the BAR;
+ * the access is forwarded to the remote device.
+ */
+static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val,
+                            unsigned size)
+{
+    ProxyMemoryRegion *region = opaque;
+
+    send_bar_access_msg(region->dev, &region->mr, true, addr, &val, size,
+                        region->memory);
+}
+
+/*
+ * MemoryRegionOps read hook: @opaque is the ProxyMemoryRegion of the BAR;
+ * the value is fetched from the remote device and returned.
+ */
+static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size)
+{
+    ProxyMemoryRegion *region = opaque;
+    uint64_t data;
+
+    /* For reads send_bar_access_msg() always stores the reply in data. */
+    send_bar_access_msg(region->dev, &region->mr, false, addr, &data, size,
+                        region->memory);
+
+    return data;
+}
+
+/*
+ * Access callbacks shared by all proxied BARs: native endianness,
+ * implemented access sizes from 1 to 8 bytes.
+ */
+const MemoryRegionOps proxy_mr_ops = {
+    .write = proxy_bar_write,
+    .read = proxy_bar_read,
+    .impl = {
+        .max_access_size = 8,
+        .min_access_size = 1,
+    },
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+/*
+ * Query the remote device over the config space channel and mirror its
+ * identity and BAR layout in the proxy.
+ *
+ * Vendor, device, class and subsystem IDs are copied into the PCI device
+ * class, the device category is derived from the base class code, and each
+ * BAR is sized with the usual "write all ones, read back, restore" sequence
+ * and registered as an I/O memory region backed by proxy_mr_ops.
+ *
+ * @errp is currently unused: config_op_send() reports failures itself.
+ */
+static void probe_pci_info(PCIDevice *dev, Error **errp)
+{
+    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
+    uint32_t orig_val, new_val, base_class, val;
+    PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
+    DeviceClass *dc = DEVICE_CLASS(pc);
+    uint8_t type;
+    int i;
+
+    config_op_send(pdev, PCI_VENDOR_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->vendor_id = (uint16_t)val;
+
+    config_op_send(pdev, PCI_DEVICE_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->device_id = (uint16_t)val;
+
+    config_op_send(pdev, PCI_CLASS_DEVICE, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->class_id = (uint16_t)val;
+
+    config_op_send(pdev, PCI_SUBSYSTEM_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->subsystem_id = (uint16_t)val;
+
+    /*
+     * The 16-bit class value read at PCI_CLASS_DEVICE holds the sub-class
+     * in bits 7:0 and the base class in bits 15:8, so the base class is
+     * the upper byte.
+     */
+    base_class = pc->class_id >> 8;
+    switch (base_class) {
+    case PCI_BASE_CLASS_BRIDGE:
+        set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+        break;
+    case PCI_BASE_CLASS_STORAGE:
+        set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+        break;
+    case PCI_BASE_CLASS_NETWORK:
+        set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+        break;
+    case PCI_BASE_CLASS_INPUT:
+        set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+        break;
+    case PCI_BASE_CLASS_DISPLAY:
+        set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
+        break;
+    case PCI_BASE_CLASS_PROCESSOR:
+        set_bit(DEVICE_CATEGORY_CPU, dc->categories);
+        break;
+    default:
+        set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+        break;
+    }
+
+    /*
+     * NOTE(review): PCI_NUM_REGIONS may include the expansion ROM slot,
+     * whose register does not live at PCI_BASE_ADDRESS_0 + 4 * i - confirm
+     * the intended upper bound.
+     */
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        uint32_t bar = PCI_BASE_ADDRESS_0 + (4 * i);
+        uint32_t mask, size;
+
+        /* Save the BAR, probe it with all ones, then restore it. */
+        config_op_send(pdev, bar, &orig_val, 4, MPQEMU_CMD_PCI_CFGREAD);
+        new_val = 0xffffffff;
+        config_op_send(pdev, bar, &new_val, 4, MPQEMU_CMD_PCI_CFGWRITE);
+        config_op_send(pdev, bar, &new_val, 4, MPQEMU_CMD_PCI_CFGREAD);
+        config_op_send(pdev, bar, &orig_val, 4, MPQEMU_CMD_PCI_CFGWRITE);
+
+        type = (new_val & PCI_BASE_ADDRESS_SPACE) ?
+               PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY;
+
+        /*
+         * I/O BARs keep flags in bits 1:0, memory BARs in bits 3:0; strip
+         * the right set before deriving the size. Unsigned arithmetic makes
+         * an unimplemented BAR (reads back 0) wrap to size 0.
+         */
+        mask = (type == PCI_BASE_ADDRESS_SPACE_IO) ?
+               (uint32_t)PCI_BASE_ADDRESS_IO_MASK :
+               (uint32_t)PCI_BASE_ADDRESS_MEM_MASK;
+        size = ~(new_val & mask) + 1;
+
+        if (size) {
+            g_autofree char *name = g_strdup_printf("bar-region-%d", i);
+
+            pdev->region[i].dev = pdev;
+            pdev->region[i].present = true;
+            if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
+                pdev->region[i].memory = true;
+            }
+            memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
+                                  &proxy_mr_ops, &pdev->region[i],
+                                  name, size);
+            pci_register_bar(dev, i, type, &pdev->region[i].mr);
+        }
+    }
+}
+
+/*
+ * DeviceClass::reset hook: ask the remote process to reset the device and
+ * wait for its reply. Errors are reported, not propagated.
+ */
+static void proxy_device_reset(DeviceState *dev)
+{
+    Error *err = NULL;
+    MPQemuMsg msg = {
+        .cmd = MPQEMU_CMD_DEVICE_RESET,
+        .size = 0,
+    };
+
+    mpqemu_msg_send_and_await_reply(&msg, PCI_PROXY_DEV(dev), &err);
+    if (err) {
+        error_report_err(err);
+    }
+}
diff --git a/hw/remote/remote-obj.c b/hw/remote/remote-obj.c
new file mode 100644
index 0000000..4f21254
--- /dev/null
+++ b/hw/remote/remote-obj.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright © 2020, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
+ *
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qemu/error-report.h"
+#include "qemu/notify.h"
+#include "qom/object_interfaces.h"
+#include "hw/qdev-core.h"
+#include "io/channel.h"
+#include "hw/qdev-core.h"
+#include "hw/remote/machine.h"
+#include "io/channel-util.h"
+#include "qapi/error.h"
+#include "sysemu/sysemu.h"
+#include "hw/pci/pci.h"
+#include "qemu/sockets.h"
+#include "monitor/monitor.h"
+
+/* QOM type name of the remote object. */
+#define TYPE_REMOTE_OBJECT "x-remote-object"
+OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT)
+
+/*
+ * Class state shared by all remote objects; used to cap how many of them
+ * may exist at the same time.
+ */
+struct RemoteObjectClass {
+ ObjectClass parent_class;
+
+ /* Incremented in instance init, decremented in instance finalize. */
+ unsigned int nr_devs;
+ /* Upper bound for nr_devs; set in class init. */
+ unsigned int max_devs;
+};
+
+/* Per-instance state of a remote object. */
+struct RemoteObject {
+ /* private */
+ Object parent;
+
+ /* Runs remote_object_machine_done() once machine init has finished. */
+ Notifier machine_done;
+
+ /* Socket descriptor from the "fd" property; -1 until it is set. */
+ int32_t fd;
+ /* Owned copy of the "devid" property: id of the device to look up. */
+ char *devid;
+
+ /* Set to NULL at init; shut down, closed and unref'ed at finalize. */
+ QIOChannel *ioc;
+
+ /* Device found for devid; its unrealize drops a reference on this object. */
+ DeviceState *dev;
+ DeviceListener listener;
+};
+
+/*
+ * Setter of the "fd" property: resolve @str to a file descriptor through
+ * the monitor and store it in the object if it refers to a socket.
+ * A descriptor that is not a socket is closed and rejected via @errp.
+ */
+static void remote_object_set_fd(Object *obj, const char *str, Error **errp)
+{
+    RemoteObject *o = REMOTE_OBJECT(obj);
+    int sock = monitor_fd_param(monitor_cur(), str, errp);
+
+    if (sock == -1) {
+        error_prepend(errp, "Could not parse remote object fd %s:", str);
+        return;
+    }
+
+    if (fd_is_socket(sock)) {
+        o->fd = sock;
+        return;
+    }
+
+    error_setg(errp, "File descriptor '%s' is not a socket", str);
+    close(sock);
+}
+
+/*
+ * Setter of the "devid" property: drop any previously stored id and keep a
+ * private copy of @str. @errp is not used.
+ */
+static void remote_object_set_devid(Object *obj, const char *str, Error **errp)
+{
+    RemoteObject *remote = REMOTE_OBJECT(obj);
+
+    g_free(remote->devid);
+    remote->devid = g_strdup(str);
+}
+
+/*
+ * DeviceListener unrealize hook: when the device this remote object serves
+ * is unrealized, drop a reference on the remote object. Unrealize events of
+ * other devices are ignored.
+ */
+static void remote_object_unrealize_listener(DeviceListener *listener,
+                                             DeviceState *dev)
+{
+    RemoteObject *remote = container_of(listener, RemoteObject, listener);
+
+    if (remote->dev != dev) {
+        return;
+    }
+
+    object_unref(OBJECT(remote));
+}
+
+/*
+ * Machine-init-done notifier of a remote object.
+ *
+ * Looks up the device named by the "devid" property, requires it to be a
+ * PCI device, wraps the configured fd in a non-blocking channel, starts
+ * listening for the device's unrealize event and finally enters the
+ * message loop coroutine that serves the device over that channel.
+ *
+ * Failures are reported with error_report*() and leave the object idle.
+ */
+static void remote_object_machine_done(Notifier *notifier, void *data)
+{
+ RemoteObject *o = container_of(notifier, RemoteObject, machine_done);
+ DeviceState *dev = NULL;
+ QIOChannel *ioc = NULL;
+ Coroutine *co = NULL;
+ RemoteCommDev *comdev = NULL;
+ Error *err = NULL;
+
+ /* NOTE(review): o->devid is NULL if the property was never set. */
+ dev = qdev_find_recursive(sysbus_get_default(), o->devid);
+ if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+ error_report("%s is not a PCI device", o->devid);
+ return;
+ }
+
+ ioc = qio_channel_new_fd(o->fd, &err);
+ if (!ioc) {
+ error_report_err(err);
+ return;
+ }
+ qio_channel_set_blocking(ioc, false, NULL);
+
+ o->dev = dev;
+
+ o->listener.unrealize = remote_object_unrealize_listener;
+ device_listener_register(&o->listener);
+
+ /* co-routine should free this. */
+ comdev = g_new0(RemoteCommDev, 1);
+ /* The channel is handed to the coroutine; o->ioc is not set here. */
+ *comdev = (RemoteCommDev) {
+ .ioc = ioc,
+ .dev = PCI_DEVICE(dev),
+ };
+
+ co = qemu_coroutine_create(mpqemu_remote_msg_loop_co, comdev);
+ qemu_coroutine_enter(co);
+}
+
+/*
+ * Instance initializer: enforces the per-class device limit, puts the
+ * fields into their "unset" state, counts the new object and arranges for
+ * remote_object_machine_done() to run once machine init has finished.
+ */
+static void remote_object_init(Object *obj)
+{
+ RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);
+ RemoteObject *o = REMOTE_OBJECT(obj);
+
+ /*
+ * NOTE(review): on this path the object is neither counted nor given a
+ * machine-done notifier, yet remote_object_finalize() still decrements
+ * k->nr_devs for it - confirm this is intended.
+ */
+ if (k->nr_devs >= k->max_devs) {
+ error_report("Reached maximum number of devices: %u", k->max_devs);
+ return;
+ }
+
+ o->ioc = NULL;
+ o->fd = -1;
+ o->devid = NULL;
+
+ k->nr_devs++;
+
+ o->machine_done.notify = remote_object_machine_done;
+ qemu_add_machine_init_done_notifier(&o->machine_done);
+}
+
+/*
+ * Instance finalizer: unregisters the device listener, shuts down and
+ * releases the channel if one is stored in the object, gives the device
+ * slot back to the class and frees the stored device id.
+ */
+static void remote_object_finalize(Object *obj)
+{
+ RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);
+ RemoteObject *o = REMOTE_OBJECT(obj);
+
+ /*
+ * NOTE(review): the listener is only registered from
+ * remote_object_machine_done(); confirm unregistering is safe when that
+ * never ran or bailed out early.
+ */
+ device_listener_unregister(&o->listener);
+
+ if (o->ioc) {
+ qio_channel_shutdown(o->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+ qio_channel_close(o->ioc, NULL);
+ }
+
+ object_unref(OBJECT(o->ioc));
+
+ k->nr_devs--;
+ g_free(o->devid);
+}
+
+/*
+ * Class initializer: sets the device limit and registers the write-only
+ * string properties "fd" and "devid".
+ */
+static void remote_object_class_init(ObjectClass *klass, void *data)
+{
+    RemoteObjectClass *remote_class = REMOTE_OBJECT_CLASS(klass);
+
+    /*
+     * Only a single device is supported for now, so that devices of one VM
+     * cannot reach the RAM of another VM. The limit can be lifted once each
+     * device gets an address space of its own.
+     */
+    remote_class->nr_devs = 0;
+    remote_class->max_devs = 1;
+
+    /* Both properties have a setter only (no getter). */
+    object_class_property_add_str(klass, "fd", NULL, remote_object_set_fd);
+    object_class_property_add_str(klass, "devid", NULL,
+                                  remote_object_set_devid);
+}
+
+/* QOM type description of the user-creatable remote object. */
+static const TypeInfo remote_object_info = {
+    .parent            = TYPE_OBJECT,
+    .name              = TYPE_REMOTE_OBJECT,
+    .class_size        = sizeof(RemoteObjectClass),
+    .class_init        = remote_object_class_init,
+    .instance_size     = sizeof(RemoteObject),
+    .instance_init     = remote_object_init,
+    .instance_finalize = remote_object_finalize,
+    .interfaces        = (InterfaceInfo[]) {
+        { TYPE_USER_CREATABLE },
+        { }
+    }
+};
+
+/* Register the remote object type with QOM; hooked in through type_init(). */
+static void register_types(void)
+{
+ type_register_static(&remote_object_info);
+}
+
+type_init(register_types);
diff --git a/hw/remote/trace-events b/hw/remote/trace-events
new file mode 100644
index 0000000..0b23974
--- /dev/null
+++ b/hw/remote/trace-events
@@ -0,0 +1,4 @@
+# multi-process trace events
+
+mpqemu_send_io_error(int cmd, int size, int nfds) "send command %d size %d, %d file descriptors to remote process"
+mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive %d size %d, %d file descriptors to remote process"
diff --git a/hw/remote/trace.h b/hw/remote/trace.h
new file mode 100644
index 0000000..5d5e3ac
--- /dev/null
+++ b/hw/remote/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_remote.h"