author     Peter Maydell <peter.maydell@linaro.org>   2023-03-02 16:13:45 +0000
committer  Peter Maydell <peter.maydell@linaro.org>   2023-03-02 16:13:45 +0000
commit     c61d1a066cb6cf90662c82d0e35660fc0ccacbaf (patch)
tree       731e3a9a7319be047824ff8d87445ad5d343f950 /hw
parent     262312d7ba6e2966acedb4f9c134fd19176b4083 (diff)
parent     526947e496e4447d74b8d42415e2847481c5043d (diff)
Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging
* bugfixes
* show machine ACPI support in QAPI
* Core Xen emulation support for KVM/x86

# -----BEGIN PGP SIGNATURE-----
#
# iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmQAlrYUHHBib256aW5p
# QHJlZGhhdC5jb20ACgkQv/vSX3jHroONWwf/fxDUMcZUvvatNxiVMhNfqEt/cL0F
# Durv1PmbbeVh9PP0W7XFkEXO3LCIRDyR4rtmCs7gHGdmzDOWQ+QIWgQijQ/y7ElQ
# bTVsvs0+s/6H3csP3dJTJaXSHshbQvrAZTsyk5KcAB6xdL1KqulfLUoGvXJhAmRs
# NKZN8un+nuAhFhL0VBWA9eQaP+BVHQI5ItAj8PaoBby4+Q9fNnat6j1/G4iLly8J
# dxIwCnuRHLiB3melWtadwbv6ddLJFeZNa50HUIsynqoItTzmRVr+oXz1yfq087dB
# 9uksmoqb+icGEdwqs0iYbQ/dhVnIrMDpn/n2Us28S5VdIMVvxr1JEbEkSQ==
# =0jY8
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 02 Mar 2023 12:29:42 GMT
# gpg:                using RSA key F13338574B662389866C7682BFFBD25F78C7AE83
# gpg:                issuer "pbonzini@redhat.com"
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [full]
# gpg:                 aka "Paolo Bonzini <pbonzini@redhat.com>" [full]
# Primary key fingerprint: 46F5 9FBD 57D6 12E7 BFD4 E2F7 7E15 100C CD36 69B1
#      Subkey fingerprint: F133 3857 4B66 2389 866C 7682 BFFB D25F 78C7 AE83

* tag 'for-upstream' of https://gitlab.com/bonzini/qemu: (62 commits)
  Makefile: qemu-bundle is a directory
  qapi: Add 'acpi' field to 'query-machines' output
  hw/xen: Subsume xen_be_register_common() into xen_be_init()
  i386/xen: Document Xen HVM emulation
  kvm/i386: Add xen-evtchn-max-pirq property
  hw/xen: Support MSI mapping to PIRQ
  hw/xen: Support GSI mapping to PIRQ
  hw/xen: Implement emulated PIRQ hypercall support
  i386/xen: Implement HYPERVISOR_physdev_op
  hw/xen: Automatically add xen-platform PCI device for emulated Xen guests
  hw/xen: Add basic ring handling to xenstore
  hw/xen: Add xen_xenstore device for xenstore emulation
  hw/xen: Add backend implementation of interdomain event channel support
  i386/xen: handle HVMOP_get_param
  i386/xen: Reserve Xen special pages for console, xenstore rings
  i386/xen: handle PV timer hypercalls
  hw/xen: Implement GNTTABOP_query_size
  i386/xen: Implement HYPERVISOR_grant_table_op and GNTTABOP_[gs]et_version
  hw/xen: Support mapping grant frames
  hw/xen: Add xen_gnttab device for grant table emulation
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'hw')
-rw-r--r--  hw/Kconfig                   |    1
-rw-r--r--  hw/core/machine-qmp-cmds.c   |    1
-rw-r--r--  hw/i386/Kconfig              |    5
-rw-r--r--  hw/i386/kvm/meson.build      |   13
-rw-r--r--  hw/i386/kvm/trace-events     |    5
-rw-r--r--  hw/i386/kvm/trace.h          |    1
-rw-r--r--  hw/i386/kvm/xen-stubs.c      |   44
-rw-r--r--  hw/i386/kvm/xen_evtchn.c     | 2341
-rw-r--r--  hw/i386/kvm/xen_evtchn.h     |   88
-rw-r--r--  hw/i386/kvm/xen_gnttab.c     |  232
-rw-r--r--  hw/i386/kvm/xen_gnttab.h     |   25
-rw-r--r--  hw/i386/kvm/xen_overlay.c    |  272
-rw-r--r--  hw/i386/kvm/xen_overlay.h    |   26
-rw-r--r--  hw/i386/kvm/xen_xenstore.c   |  500
-rw-r--r--  hw/i386/kvm/xen_xenstore.h   |   20
-rw-r--r--  hw/i386/pc.c                 |   26
-rw-r--r--  hw/i386/x86.c                |   16
-rw-r--r--  hw/i386/xen/meson.build      |    5
-rw-r--r--  hw/i386/xen/xen-hvm.c        |    8
-rw-r--r--  hw/i386/xen/xen_platform.c   |   57
-rw-r--r--  hw/pci/msi.c                 |   11
-rw-r--r--  hw/pci/msix.c                |    9
-rw-r--r--  hw/pci/pci.c                 |   17
-rw-r--r--  hw/xen/Kconfig               |    3
-rw-r--r--  hw/xen/xen-legacy-backend.c  |   56
-rw-r--r--  hw/xenpv/xen_machine_pv.c    |    6
26 files changed, 3725 insertions, 63 deletions
diff --git a/hw/Kconfig b/hw/Kconfig
index 38233bb..ba62ff6 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -41,6 +41,7 @@ source tpm/Kconfig
source usb/Kconfig
source virtio/Kconfig
source vfio/Kconfig
+source xen/Kconfig
source watchdog/Kconfig
# arch Kconfig
diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c
index 2d90474..b98ff15 100644
--- a/hw/core/machine-qmp-cmds.c
+++ b/hw/core/machine-qmp-cmds.c
@@ -102,6 +102,7 @@ MachineInfoList *qmp_query_machines(Error **errp)
info->hotpluggable_cpus = mc->has_hotpluggable_cpus;
info->numa_mem_supported = mc->numa_mem_supported;
info->deprecated = !!mc->deprecation_reason;
+ info->acpi = !!object_class_property_find(OBJECT_CLASS(mc), "acpi");
if (mc->default_cpu_type) {
info->default_cpu_type = g_strdup(mc->default_cpu_type);
}
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index 9fbfe74..d40802d 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -136,3 +136,8 @@ config VMPORT
config VMMOUSE
bool
depends on VMPORT
+
+config XEN_EMU
+ bool
+ default y
+ depends on KVM && (I386 || X86_64)
diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 95467f1..82dd6ae 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -4,5 +4,18 @@ i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c'))
i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c'))
i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c'))
i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c'))
+i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
+ 'xen_overlay.c',
+ 'xen_evtchn.c',
+ 'xen_gnttab.c',
+ 'xen_xenstore.c',
+ ))
i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
+
+xen_stubs_ss = ss.source_set()
+xen_stubs_ss.add(when: 'CONFIG_XEN_EMU', if_false: files(
+ 'xen-stubs.c',
+))
+
+specific_ss.add_all(when: 'CONFIG_SOFTMMU', if_true: xen_stubs_ss)
diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events
new file mode 100644
index 0000000..b83c3eb
--- /dev/null
+++ b/hw/i386/kvm/trace-events
@@ -0,0 +1,5 @@
+kvm_xen_map_pirq(int pirq, int gsi) "pirq %d gsi %d"
+kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d"
+kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d"
+kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d"
+kvm_xen_unmask_pirq(int pirq, char *dev, int vector) "pirq %d dev %s vector %d"
diff --git a/hw/i386/kvm/trace.h b/hw/i386/kvm/trace.h
new file mode 100644
index 0000000..e55d081
--- /dev/null
+++ b/hw/i386/kvm/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_i386_kvm.h"
diff --git a/hw/i386/kvm/xen-stubs.c b/hw/i386/kvm/xen-stubs.c
new file mode 100644
index 0000000..ae406e0
--- /dev/null
+++ b/hw/i386/kvm/xen-stubs.c
@@ -0,0 +1,44 @@
+/*
+ * QEMU Xen emulation: QMP stubs
+ *
+ * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qapi/qapi-commands-misc-target.h"
+
+#include "xen_evtchn.h"
+
+void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector,
+ uint64_t addr, uint32_t data, bool is_masked)
+{
+}
+
+void xen_evtchn_remove_pci_device(PCIDevice *dev)
+{
+}
+
+bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data)
+{
+ return false;
+}
+
+#ifdef TARGET_I386
+EvtchnInfoList *qmp_xen_event_list(Error **errp)
+{
+ error_setg(errp, "Xen event channel emulation not enabled");
+ return NULL;
+}
+
+void qmp_xen_event_inject(uint32_t port, Error **errp)
+{
+ error_setg(errp, "Xen event channel emulation not enabled");
+}
+#endif
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
new file mode 100644
index 0000000..886fbf6
--- /dev/null
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -0,0 +1,2341 @@
+/*
+ * QEMU Xen emulation: Event channel support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/lockable.h"
+#include "qemu/main-loop.h"
+#include "qemu/log.h"
+#include "monitor/monitor.h"
+#include "monitor/hmp.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-misc-target.h"
+#include "qapi/qmp/qdict.h"
+#include "qom/object.h"
+#include "exec/target_page.h"
+#include "exec/address-spaces.h"
+#include "migration/vmstate.h"
+#include "trace.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "hw/i386/x86.h"
+#include "hw/i386/pc.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "hw/irq.h"
+
+#include "xen_evtchn.h"
+#include "xen_overlay.h"
+#include "xen_xenstore.h"
+
+#include "sysemu/kvm.h"
+#include "sysemu/kvm_xen.h"
+#include <linux/kvm.h>
+#include <sys/eventfd.h>
+
+#include "hw/xen/interface/memory.h"
+#include "hw/xen/interface/hvm/params.h"
+
+/* XX: For kvm_update_msi_routes_all() */
+#include "target/i386/kvm/kvm_i386.h"
+
+#define TYPE_XEN_EVTCHN "xen-evtchn"
+OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN)
+
+typedef struct XenEvtchnPort {
+ uint32_t vcpu; /* Xen/ACPI vcpu_id */
+ uint16_t type; /* EVTCHNSTAT_xxxx */
+ uint16_t type_val; /* pirq# / virq# / remote port according to type */
+} XenEvtchnPort;
+
+/* 32-bit compatibility definitions, also used natively in 32-bit build */
+struct compat_arch_vcpu_info {
+ unsigned int cr2;
+ unsigned int pad[5];
+};
+
+struct compat_vcpu_info {
+ uint8_t evtchn_upcall_pending;
+ uint8_t evtchn_upcall_mask;
+ uint16_t pad;
+ uint32_t evtchn_pending_sel;
+ struct compat_arch_vcpu_info arch;
+ struct vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
+struct compat_arch_shared_info {
+ unsigned int max_pfn;
+ unsigned int pfn_to_mfn_frame_list_list;
+ unsigned int nmi_reason;
+ unsigned int p2m_cr3;
+ unsigned int p2m_vaddr;
+ unsigned int p2m_generation;
+ uint32_t wc_sec_hi;
+};
+
+struct compat_shared_info {
+ struct compat_vcpu_info vcpu_info[XEN_LEGACY_MAX_VCPUS];
+ uint32_t evtchn_pending[32];
+ uint32_t evtchn_mask[32];
+ uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */
+ uint32_t wc_sec;
+ uint32_t wc_nsec;
+ struct compat_arch_shared_info arch;
+};
+
+#define COMPAT_EVTCHN_2L_NR_CHANNELS 1024
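+/*
+ * This limit follows from the 2-level ABI arithmetic above: the compat
+ * shared_info carries 32 words of 32 bits each in evtchn_pending[] and
+ * evtchn_mask[], i.e. 32 * 32 = 1024 channels, versus the 64 * 64 = 4096
+ * (EVTCHN_2L_NR_CHANNELS) available to a 64-bit guest.
+ */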
+
+/* Local private implementation of struct xenevtchn_handle */
+struct xenevtchn_handle {
+ evtchn_port_t be_port;
+ evtchn_port_t guest_port; /* Or zero for unbound */
+ int fd;
+};
+
+/*
+ * For unbound/interdomain ports there are only two possible remote
+ * domains; self and QEMU. Use a single high bit in type_val for that,
+ * and the low bits for the remote port number (or 0 for unbound).
+ */
+#define PORT_INFO_TYPEVAL_REMOTE_QEMU 0x8000
+#define PORT_INFO_TYPEVAL_REMOTE_PORT_MASK 0x7FFF
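+/*
+ * For example: a guest port connected to QEMU backend port 5 carries
+ * type_val 0x8005, a loopback interdomain port whose peer is guest port 12
+ * carries type_val 0x000c, and an unbound port which named QEMU as its
+ * remote domain carries type_val 0x8000.
+ */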
+
+/*
+ * These 'emuirq' values are used by Xen in the LM stream... and yes, I am
+ * insane enough to think about guest-transparent live migration from actual
+ * Xen to QEMU, and ensuring that we can convert/consume the stream.
+ */
+#define IRQ_UNBOUND -1
+#define IRQ_PT -2
+#define IRQ_MSI_EMU -3
+
+
+struct pirq_info {
+ int gsi;
+ uint16_t port;
+ PCIDevice *dev;
+ int vector;
+ bool is_msix;
+ bool is_masked;
+ bool is_translated;
+};
+
+struct XenEvtchnState {
+ /*< private >*/
+ SysBusDevice busdev;
+ /*< public >*/
+
+ uint64_t callback_param;
+ bool evtchn_in_kernel;
+ uint32_t callback_gsi;
+
+ QEMUBH *gsi_bh;
+
+ QemuMutex port_lock;
+ uint32_t nr_ports;
+ XenEvtchnPort port_table[EVTCHN_2L_NR_CHANNELS];
+ qemu_irq gsis[IOAPIC_NUM_PINS];
+
+ struct xenevtchn_handle *be_handles[EVTCHN_2L_NR_CHANNELS];
+
+ uint32_t nr_pirqs;
+
+ /* Bitmap of allocated PIRQs (serialized) */
+ uint16_t nr_pirq_inuse_words;
+ uint64_t *pirq_inuse_bitmap;
+
+ /* GSI → PIRQ mapping (serialized) */
+ uint16_t gsi_pirq[IOAPIC_NUM_PINS];
+
+ /* Per-GSI assertion state (serialized) */
+ uint32_t pirq_gsi_set;
+
+ /* Per-PIRQ information (rebuilt on migration, protected by BQL) */
+ struct pirq_info *pirq;
+};
+
+#define pirq_inuse_word(s, pirq) (s->pirq_inuse_bitmap[((pirq) / 64)])
+#define pirq_inuse_bit(pirq) (1ULL << ((pirq) & 63))
+
+#define pirq_inuse(s, pirq) (pirq_inuse_word(s, pirq) & pirq_inuse_bit(pirq))
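+/* e.g. PIRQ 70 lives in pirq_inuse_bitmap[1], bit 6 (70 / 64 = 1, 70 & 63 = 6) */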
+
+struct XenEvtchnState *xen_evtchn_singleton;
+
+/* Top bits of callback_param are the type (HVM_PARAM_CALLBACK_TYPE_xxx) */
+#define CALLBACK_VIA_TYPE_SHIFT 56
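+/*
+ * e.g. a guest requesting GSI delivery sets HVM_PARAM_CALLBACK_IRQ to
+ * ((uint64_t)HVM_PARAM_CALLBACK_TYPE_GSI << 56) | gsi
+ */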
+
+static void unbind_backend_ports(XenEvtchnState *s);
+
+static int xen_evtchn_pre_load(void *opaque)
+{
+ XenEvtchnState *s = opaque;
+
+ /* Unbind all the backend-side ports; they need to rebind */
+ unbind_backend_ports(s);
+
+ /* It'll be leaked otherwise. */
+ g_free(s->pirq_inuse_bitmap);
+ s->pirq_inuse_bitmap = NULL;
+
+ return 0;
+}
+
+static int xen_evtchn_post_load(void *opaque, int version_id)
+{
+ XenEvtchnState *s = opaque;
+ uint32_t i;
+
+ if (s->callback_param) {
+ xen_evtchn_set_callback_param(s->callback_param);
+ }
+
+ /* Rebuild s->pirq[].port mapping */
+ for (i = 0; i < s->nr_ports; i++) {
+ XenEvtchnPort *p = &s->port_table[i];
+
+ if (p->type == EVTCHNSTAT_pirq) {
+ assert(p->type_val);
+ assert(p->type_val < s->nr_pirqs);
+
+ /*
+ * Set the gsi to IRQ_UNBOUND; it may be changed to an actual
+ * GSI# below, or to IRQ_MSI_EMU when the MSI table snooping
+ * catches up with it.
+ */
+ s->pirq[p->type_val].gsi = IRQ_UNBOUND;
+ s->pirq[p->type_val].port = i;
+ }
+ }
+ /* Rebuild s->pirq[].gsi mapping */
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ if (s->gsi_pirq[i]) {
+ s->pirq[s->gsi_pirq[i]].gsi = i;
+ }
+ }
+ return 0;
+}
+
+static bool xen_evtchn_is_needed(void *opaque)
+{
+ return xen_mode == XEN_EMULATE;
+}
+
+static const VMStateDescription xen_evtchn_port_vmstate = {
+ .name = "xen_evtchn_port",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(vcpu, XenEvtchnPort),
+ VMSTATE_UINT16(type, XenEvtchnPort),
+ VMSTATE_UINT16(type_val, XenEvtchnPort),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription xen_evtchn_vmstate = {
+ .name = "xen_evtchn",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = xen_evtchn_is_needed,
+ .pre_load = xen_evtchn_pre_load,
+ .post_load = xen_evtchn_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(callback_param, XenEvtchnState),
+ VMSTATE_UINT32(nr_ports, XenEvtchnState),
+ VMSTATE_STRUCT_VARRAY_UINT32(port_table, XenEvtchnState, nr_ports, 1,
+ xen_evtchn_port_vmstate, XenEvtchnPort),
+ VMSTATE_UINT16_ARRAY(gsi_pirq, XenEvtchnState, IOAPIC_NUM_PINS),
+ VMSTATE_VARRAY_UINT16_ALLOC(pirq_inuse_bitmap, XenEvtchnState,
+ nr_pirq_inuse_words, 0,
+ vmstate_info_uint64, uint64_t),
+ VMSTATE_UINT32(pirq_gsi_set, XenEvtchnState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void xen_evtchn_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->vmsd = &xen_evtchn_vmstate;
+}
+
+static const TypeInfo xen_evtchn_info = {
+ .name = TYPE_XEN_EVTCHN,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(XenEvtchnState),
+ .class_init = xen_evtchn_class_init,
+};
+
+static void gsi_assert_bh(void *opaque)
+{
+ struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
+ if (vi) {
+ xen_evtchn_set_callback_level(!!vi->evtchn_upcall_pending);
+ }
+}
+
+void xen_evtchn_create(void)
+{
+ XenEvtchnState *s = XEN_EVTCHN(sysbus_create_simple(TYPE_XEN_EVTCHN,
+ -1, NULL));
+ int i;
+
+ xen_evtchn_singleton = s;
+
+ qemu_mutex_init(&s->port_lock);
+ s->gsi_bh = aio_bh_new(qemu_get_aio_context(), gsi_assert_bh, s);
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ sysbus_init_irq(SYS_BUS_DEVICE(s), &s->gsis[i]);
+ }
+
+ /*
+ * The Xen scheme for encoding PIRQ# into an MSI message is not
+ * compatible with 32-bit MSI, as it puts the high bits of the
+ * PIRQ# into the high bits of the MSI message address, instead of
+ * using the Extended Destination ID in address bits 4-11 which
+ * perhaps would have been a better choice.
+ *
+ * To keep life simple, kvm_accel_instance_init() initialises the
+ * default to 256, which conveniently doesn't need to set anything
+ * outside the low 32 bits of the address. It can be increased by
+ * setting the xen-evtchn-max-pirq property.
+ */
+ s->nr_pirqs = kvm_xen_get_evtchn_max_pirq();
+
+ s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64);
+ s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words);
+ s->pirq = g_new0(struct pirq_info, s->nr_pirqs);
+}
+
+void xen_evtchn_connect_gsis(qemu_irq *system_gsis)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int i;
+
+ if (!s) {
+ return;
+ }
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ sysbus_connect_irq(SYS_BUS_DEVICE(s), i, system_gsis[i]);
+ }
+}
+
+static void xen_evtchn_register_types(void)
+{
+ type_register_static(&xen_evtchn_info);
+}
+
+type_init(xen_evtchn_register_types)
+
+static int set_callback_pci_intx(XenEvtchnState *s, uint64_t param)
+{
+ PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+ uint8_t pin = param & 3;
+ uint8_t devfn = (param >> 8) & 0xff;
+ uint16_t bus = (param >> 16) & 0xffff;
+ uint16_t domain = (param >> 32) & 0xffff;
+ PCIDevice *pdev;
+ PCIINTxRoute r;
+
+ if (domain || !pcms) {
+ return 0;
+ }
+
+ pdev = pci_find_device(pcms->bus, bus, devfn);
+ if (!pdev) {
+ return 0;
+ }
+
+ r = pci_device_route_intx_to_irq(pdev, pin);
+ if (r.mode != PCI_INTX_ENABLED) {
+ return 0;
+ }
+
+ /*
+ * Hm, can we be notified of INTX routing changes? Not without
+ * *owning* the device and being allowed to overwrite its own
+ * ->intx_routing_notifier, AFAICT. So let's not.
+ */
+ return r.irq;
+}
+
+void xen_evtchn_set_callback_level(int level)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ if (!s) {
+ return;
+ }
+
+ /*
+ * We get to this function in a number of ways:
+ *
+ * • From I/O context, via PV backend drivers sending a notification to
+ * the guest.
+ *
+ * • From guest vCPU context, via loopback interdomain event channels
+ * (or theoretically even IPIs but guests don't use those with GSI
+ * delivery because that's pointless. We don't want a malicious guest
+ * to be able to trigger a deadlock though, so we can't rule it out.)
+ *
+ * • From guest vCPU context when the HVM_PARAM_CALLBACK_IRQ is being
+ * configured.
+ *
+ * • From guest vCPU context in the KVM exit handler, if the upcall
+ * pending flag has been cleared and the GSI needs to be deasserted.
+ *
+ * • Maybe in future, in an interrupt ack/eoi notifier when the GSI has
+ * been acked in the irqchip.
+ *
+ * Whichever context we come from, if we aren't already holding the BQL
+ * then we can't take it now, as we may already hold s->port_lock. So
+ * trigger the BH to set the IRQ for us instead of doing it immediately.
+ *
+ * In the HVM_PARAM_CALLBACK_IRQ and KVM exit handler cases, the caller
+ * will deliberately take the BQL because they want the change to take
+ * effect immediately. That just leaves interdomain loopback as the case
+ * which uses the BH.
+ */
+ if (!qemu_mutex_iothread_locked()) {
+ qemu_bh_schedule(s->gsi_bh);
+ return;
+ }
+
+ if (s->callback_gsi && s->callback_gsi < IOAPIC_NUM_PINS) {
+ qemu_set_irq(s->gsis[s->callback_gsi], level);
+ if (level) {
+ /* Ensure the vCPU polls for deassertion */
+ kvm_xen_set_callback_asserted();
+ }
+ }
+}
+
+int xen_evtchn_set_callback_param(uint64_t param)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ struct kvm_xen_hvm_attr xa = {
+ .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
+ .u.vector = 0,
+ };
+ bool in_kernel = false;
+ uint32_t gsi = 0;
+ int type = param >> CALLBACK_VIA_TYPE_SHIFT;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ /*
+ * We need the BQL because set_callback_pci_intx() may call into PCI code,
+ * and because we may need to manipulate the old and new GSI levels.
+ */
+ assert(qemu_mutex_iothread_locked());
+ qemu_mutex_lock(&s->port_lock);
+
+ switch (type) {
+ case HVM_PARAM_CALLBACK_TYPE_VECTOR: {
+ xa.u.vector = (uint8_t)param;
+
+ ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+ if (!ret && kvm_xen_has_cap(EVTCHN_SEND)) {
+ in_kernel = true;
+ }
+ gsi = 0;
+ break;
+ }
+
+ case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
+ gsi = set_callback_pci_intx(s, param);
+ ret = gsi ? 0 : -EINVAL;
+ break;
+
+ case HVM_PARAM_CALLBACK_TYPE_GSI:
+ gsi = (uint32_t)param;
+ ret = 0;
+ break;
+
+ default:
+ /* Xen doesn't return error even if you set something bogus */
+ ret = 0;
+ break;
+ }
+
+ if (!ret) {
+ /* If vector delivery was turned *off* then tell the kernel */
+ if ((s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) ==
+ HVM_PARAM_CALLBACK_TYPE_VECTOR && !xa.u.vector) {
+ kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+ }
+ s->callback_param = param;
+ s->evtchn_in_kernel = in_kernel;
+
+ if (gsi != s->callback_gsi) {
+ struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
+
+ xen_evtchn_set_callback_level(0);
+ s->callback_gsi = gsi;
+
+ if (gsi && vi && vi->evtchn_upcall_pending) {
+ kvm_xen_inject_vcpu_callback_vector(0, type);
+ }
+ }
+ }
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+}
+
+static void inject_callback(XenEvtchnState *s, uint32_t vcpu)
+{
+ int type = s->callback_param >> CALLBACK_VIA_TYPE_SHIFT;
+
+ kvm_xen_inject_vcpu_callback_vector(vcpu, type);
+}
+
+static void deassign_kernel_port(evtchn_port_t port)
+{
+ struct kvm_xen_hvm_attr ha;
+ int ret;
+
+ ha.type = KVM_XEN_ATTR_TYPE_EVTCHN;
+ ha.u.evtchn.send_port = port;
+ ha.u.evtchn.flags = KVM_XEN_EVTCHN_DEASSIGN;
+
+ ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &ha);
+ if (ret) {
+ qemu_log_mask(LOG_GUEST_ERROR, "Failed to unbind kernel port %d: %s\n",
+ port, strerror(-ret));
+ }
+}
+
+static int assign_kernel_port(uint16_t type, evtchn_port_t port,
+ uint32_t vcpu_id)
+{
+ CPUState *cpu = qemu_get_cpu(vcpu_id);
+ struct kvm_xen_hvm_attr ha;
+
+ if (!cpu) {
+ return -ENOENT;
+ }
+
+ ha.type = KVM_XEN_ATTR_TYPE_EVTCHN;
+ ha.u.evtchn.send_port = port;
+ ha.u.evtchn.type = type;
+ ha.u.evtchn.flags = 0;
+ ha.u.evtchn.deliver.port.port = port;
+ ha.u.evtchn.deliver.port.vcpu = kvm_arch_vcpu_id(cpu);
+ ha.u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &ha);
+}
+
+static int assign_kernel_eventfd(uint16_t type, evtchn_port_t port, int fd)
+{
+ struct kvm_xen_hvm_attr ha;
+
+ ha.type = KVM_XEN_ATTR_TYPE_EVTCHN;
+ ha.u.evtchn.send_port = port;
+ ha.u.evtchn.type = type;
+ ha.u.evtchn.flags = 0;
+ ha.u.evtchn.deliver.eventfd.port = 0;
+ ha.u.evtchn.deliver.eventfd.fd = fd;
+
+ return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &ha);
+}
+
+static bool valid_port(evtchn_port_t port)
+{
+ if (!port) {
+ return false;
+ }
+
+ if (xen_is_long_mode()) {
+ return port < EVTCHN_2L_NR_CHANNELS;
+ } else {
+ return port < COMPAT_EVTCHN_2L_NR_CHANNELS;
+ }
+}
+
+static bool valid_vcpu(uint32_t vcpu)
+{
+ return !!qemu_get_cpu(vcpu);
+}
+
+static void unbind_backend_ports(XenEvtchnState *s)
+{
+ XenEvtchnPort *p;
+ int i;
+
+ for (i = 1; i < s->nr_ports; i++) {
+ p = &s->port_table[i];
+ if (p->type == EVTCHNSTAT_interdomain &&
+ (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU)) {
+ evtchn_port_t be_port = p->type_val & PORT_INFO_TYPEVAL_REMOTE_PORT_MASK;
+
+ if (s->be_handles[be_port]) {
+ /* This part will be overwritten on the load anyway. */
+ p->type = EVTCHNSTAT_unbound;
+ p->type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+
+ /* Leave the backend port open and unbound too. */
+ if (kvm_xen_has_cap(EVTCHN_SEND)) {
+ deassign_kernel_port(i);
+ }
+ s->be_handles[be_port]->guest_port = 0;
+ }
+ }
+ }
+}
+
+int xen_evtchn_status_op(struct evtchn_status *status)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ XenEvtchnPort *p;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (status->dom != DOMID_SELF && status->dom != xen_domid) {
+ return -ESRCH;
+ }
+
+ if (!valid_port(status->port)) {
+ return -EINVAL;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ p = &s->port_table[status->port];
+
+ status->status = p->type;
+ status->vcpu = p->vcpu;
+
+ switch (p->type) {
+ case EVTCHNSTAT_unbound:
+ if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+ status->u.unbound.dom = DOMID_QEMU;
+ } else {
+ status->u.unbound.dom = xen_domid;
+ }
+ break;
+
+ case EVTCHNSTAT_interdomain:
+ if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+ status->u.interdomain.dom = DOMID_QEMU;
+ } else {
+ status->u.interdomain.dom = xen_domid;
+ }
+
+ status->u.interdomain.port = p->type_val &
+ PORT_INFO_TYPEVAL_REMOTE_PORT_MASK;
+ break;
+
+ case EVTCHNSTAT_pirq:
+ status->u.pirq = p->type_val;
+ break;
+
+ case EVTCHNSTAT_virq:
+ status->u.virq = p->type_val;
+ break;
+ }
+
+ qemu_mutex_unlock(&s->port_lock);
+ return 0;
+}
+
+/*
+ * Never thought I'd hear myself say this, but C++ templates would be
+ * kind of nice here.
+ *
+ * template<class T> static int do_unmask_port(T *shinfo, ...);
+ */
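+/*
+ * Worked example of the 2-level lookup below, for a 64-bit guest: port 100
+ * gives idx = 100 / 64 = 1 and offset = 100 % 64 = 36, so the per-port state
+ * is bit 36 of shinfo->evtchn_pending[1] / evtchn_mask[1]. The per-vCPU
+ * summary is then bit 1 (== idx) of vcpu_info->evtchn_pending_sel, and the
+ * final evtchn_upcall_pending byte is the single "something is pending" flag
+ * whose 0->1 transition triggers the callback injection.
+ */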
+static int do_unmask_port_lm(XenEvtchnState *s, evtchn_port_t port,
+ bool do_unmask, struct shared_info *shinfo,
+ struct vcpu_info *vcpu_info)
+{
+ const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+ typeof(shinfo->evtchn_pending[0]) mask;
+ int idx = port / bits_per_word;
+ int offset = port % bits_per_word;
+
+ mask = 1UL << offset;
+
+ if (idx >= bits_per_word) {
+ return -EINVAL;
+ }
+
+ if (do_unmask) {
+ /*
+ * If this is a true unmask operation, clear the mask bit. If
+ * it was already unmasked, we have nothing further to do.
+ */
+ if (!((qatomic_fetch_and(&shinfo->evtchn_mask[idx], ~mask) & mask))) {
+ return 0;
+ }
+ } else {
+ /*
+ * This is a pseudo-unmask for affinity changes. We don't
+ * change the mask bit, and if it's *masked* we have nothing
+ * else to do.
+ */
+ if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) {
+ return 0;
+ }
+ }
+
+ /* If the event was not pending, we're done. */
+ if (!(qatomic_fetch_or(&shinfo->evtchn_pending[idx], 0) & mask)) {
+ return 0;
+ }
+
+ /* Now on to the vcpu_info evtchn_pending_sel index... */
+ mask = 1UL << idx;
+
+ /* If a port in this word was already pending for this vCPU, all done. */
+ if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) {
+ return 0;
+ }
+
+ /* Set evtchn_upcall_pending for this vCPU */
+ if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) {
+ return 0;
+ }
+
+ inject_callback(s, s->port_table[port].vcpu);
+
+ return 0;
+}
+
+static int do_unmask_port_compat(XenEvtchnState *s, evtchn_port_t port,
+ bool do_unmask,
+ struct compat_shared_info *shinfo,
+ struct compat_vcpu_info *vcpu_info)
+{
+ const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+ typeof(shinfo->evtchn_pending[0]) mask;
+ int idx = port / bits_per_word;
+ int offset = port % bits_per_word;
+
+ mask = 1UL << offset;
+
+ if (idx >= bits_per_word) {
+ return -EINVAL;
+ }
+
+ if (do_unmask) {
+ /*
+ * If this is a true unmask operation, clear the mask bit. If
+ * it was already unmasked, we have nothing further to do.
+ */
+ if (!((qatomic_fetch_and(&shinfo->evtchn_mask[idx], ~mask) & mask))) {
+ return 0;
+ }
+ } else {
+ /*
+ * This is a pseudo-unmask for affinity changes. We don't
+ * change the mask bit, and if it's *masked* we have nothing
+ * else to do.
+ */
+ if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) {
+ return 0;
+ }
+ }
+
+ /* If the event was not pending, we're done. */
+ if (!(qatomic_fetch_or(&shinfo->evtchn_pending[idx], 0) & mask)) {
+ return 0;
+ }
+
+ /* Now on to the vcpu_info evtchn_pending_sel index... */
+ mask = 1UL << idx;
+
+ /* If a port in this word was already pending for this vCPU, all done. */
+ if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) {
+ return 0;
+ }
+
+ /* Set evtchn_upcall_pending for this vCPU */
+ if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) {
+ return 0;
+ }
+
+ inject_callback(s, s->port_table[port].vcpu);
+
+ return 0;
+}
+
+static int unmask_port(XenEvtchnState *s, evtchn_port_t port, bool do_unmask)
+{
+ void *vcpu_info, *shinfo;
+
+ if (s->port_table[port].type == EVTCHNSTAT_closed) {
+ return -EINVAL;
+ }
+
+ shinfo = xen_overlay_get_shinfo_ptr();
+ if (!shinfo) {
+ return -ENOTSUP;
+ }
+
+ vcpu_info = kvm_xen_get_vcpu_info_hva(s->port_table[port].vcpu);
+ if (!vcpu_info) {
+ return -EINVAL;
+ }
+
+ if (xen_is_long_mode()) {
+ return do_unmask_port_lm(s, port, do_unmask, shinfo, vcpu_info);
+ } else {
+ return do_unmask_port_compat(s, port, do_unmask, shinfo, vcpu_info);
+ }
+}
+
+static int do_set_port_lm(XenEvtchnState *s, evtchn_port_t port,
+ struct shared_info *shinfo,
+ struct vcpu_info *vcpu_info)
+{
+ const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+ typeof(shinfo->evtchn_pending[0]) mask;
+ int idx = port / bits_per_word;
+ int offset = port % bits_per_word;
+
+ mask = 1UL << offset;
+
+ if (idx >= bits_per_word) {
+ return -EINVAL;
+ }
+
+ /* Update the pending bit itself. If it was already set, we're done. */
+ if (qatomic_fetch_or(&shinfo->evtchn_pending[idx], mask) & mask) {
+ return 0;
+ }
+
+ /* Check if it's masked. */
+ if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) {
+ return 0;
+ }
+
+ /* Now on to the vcpu_info evtchn_pending_sel index... */
+ mask = 1UL << idx;
+
+ /* If a port in this word was already pending for this vCPU, all done. */
+ if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) {
+ return 0;
+ }
+
+ /* Set evtchn_upcall_pending for this vCPU */
+ if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) {
+ return 0;
+ }
+
+ inject_callback(s, s->port_table[port].vcpu);
+
+ return 0;
+}
+
+static int do_set_port_compat(XenEvtchnState *s, evtchn_port_t port,
+ struct compat_shared_info *shinfo,
+ struct compat_vcpu_info *vcpu_info)
+{
+ const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+ typeof(shinfo->evtchn_pending[0]) mask;
+ int idx = port / bits_per_word;
+ int offset = port % bits_per_word;
+
+ mask = 1UL << offset;
+
+ if (idx >= bits_per_word) {
+ return -EINVAL;
+ }
+
+ /* Update the pending bit itself. If it was already set, we're done. */
+ if (qatomic_fetch_or(&shinfo->evtchn_pending[idx], mask) & mask) {
+ return 0;
+ }
+
+ /* Check if it's masked. */
+ if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) {
+ return 0;
+ }
+
+ /* Now on to the vcpu_info evtchn_pending_sel index... */
+ mask = 1UL << idx;
+
+ /* If a port in this word was already pending for this vCPU, all done. */
+ if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) {
+ return 0;
+ }
+
+ /* Set evtchn_upcall_pending for this vCPU */
+ if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) {
+ return 0;
+ }
+
+ inject_callback(s, s->port_table[port].vcpu);
+
+ return 0;
+}
+
+static int set_port_pending(XenEvtchnState *s, evtchn_port_t port)
+{
+ void *vcpu_info, *shinfo;
+
+ if (s->port_table[port].type == EVTCHNSTAT_closed) {
+ return -EINVAL;
+ }
+
+ if (s->evtchn_in_kernel) {
+ XenEvtchnPort *p = &s->port_table[port];
+ CPUState *cpu = qemu_get_cpu(p->vcpu);
+ struct kvm_irq_routing_xen_evtchn evt;
+
+ if (!cpu) {
+ return 0;
+ }
+
+ evt.port = port;
+ evt.vcpu = kvm_arch_vcpu_id(cpu);
+ evt.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_EVTCHN_SEND, &evt);
+ }
+
+ shinfo = xen_overlay_get_shinfo_ptr();
+ if (!shinfo) {
+ return -ENOTSUP;
+ }
+
+ vcpu_info = kvm_xen_get_vcpu_info_hva(s->port_table[port].vcpu);
+ if (!vcpu_info) {
+ return -EINVAL;
+ }
+
+ if (xen_is_long_mode()) {
+ return do_set_port_lm(s, port, shinfo, vcpu_info);
+ } else {
+ return do_set_port_compat(s, port, shinfo, vcpu_info);
+ }
+}
+
+static int clear_port_pending(XenEvtchnState *s, evtchn_port_t port)
+{
+ void *p = xen_overlay_get_shinfo_ptr();
+
+ if (!p) {
+ return -ENOTSUP;
+ }
+
+ if (xen_is_long_mode()) {
+ struct shared_info *shinfo = p;
+ const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+ typeof(shinfo->evtchn_pending[0]) mask;
+ int idx = port / bits_per_word;
+ int offset = port % bits_per_word;
+
+ mask = 1UL << offset;
+
+ qatomic_fetch_and(&shinfo->evtchn_pending[idx], ~mask);
+ } else {
+ struct compat_shared_info *shinfo = p;
+ const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+ typeof(shinfo->evtchn_pending[0]) mask;
+ int idx = port / bits_per_word;
+ int offset = port % bits_per_word;
+
+ mask = 1UL << offset;
+
+ qatomic_fetch_and(&shinfo->evtchn_pending[idx], ~mask);
+ }
+ return 0;
+}
+
+static void free_port(XenEvtchnState *s, evtchn_port_t port)
+{
+ s->port_table[port].type = EVTCHNSTAT_closed;
+ s->port_table[port].type_val = 0;
+ s->port_table[port].vcpu = 0;
+
+ if (s->nr_ports == port + 1) {
+ do {
+ s->nr_ports--;
+ } while (s->nr_ports &&
+ s->port_table[s->nr_ports - 1].type == EVTCHNSTAT_closed);
+ }
+
+ /* Clear pending event to avoid unexpected behavior on re-bind. */
+ clear_port_pending(s, port);
+}
+
+static int allocate_port(XenEvtchnState *s, uint32_t vcpu, uint16_t type,
+ uint16_t val, evtchn_port_t *port)
+{
+ evtchn_port_t p = 1;
+
+ for (p = 1; valid_port(p); p++) {
+ if (s->port_table[p].type == EVTCHNSTAT_closed) {
+ s->port_table[p].vcpu = vcpu;
+ s->port_table[p].type = type;
+ s->port_table[p].type_val = val;
+
+ *port = p;
+
+ if (s->nr_ports < p + 1) {
+ s->nr_ports = p + 1;
+ }
+
+ return 0;
+ }
+ }
+ return -ENOSPC;
+}
+
+static bool virq_is_global(uint32_t virq)
+{
+ switch (virq) {
+ case VIRQ_TIMER:
+ case VIRQ_DEBUG:
+ case VIRQ_XENOPROF:
+ case VIRQ_XENPMU:
+ return false;
+
+ default:
+ return true;
+ }
+}
+
+static int close_port(XenEvtchnState *s, evtchn_port_t port,
+ bool *flush_kvm_routes)
+{
+ XenEvtchnPort *p = &s->port_table[port];
+
+ /* Because it *might* be a PIRQ port */
+ assert(qemu_mutex_iothread_locked());
+
+ switch (p->type) {
+ case EVTCHNSTAT_closed:
+ return -ENOENT;
+
+ case EVTCHNSTAT_pirq:
+ s->pirq[p->type_val].port = 0;
+ if (s->pirq[p->type_val].is_translated) {
+ *flush_kvm_routes = true;
+ }
+ break;
+
+ case EVTCHNSTAT_virq:
+ kvm_xen_set_vcpu_virq(virq_is_global(p->type_val) ? 0 : p->vcpu,
+ p->type_val, 0);
+ break;
+
+ case EVTCHNSTAT_ipi:
+ if (s->evtchn_in_kernel) {
+ deassign_kernel_port(port);
+ }
+ break;
+
+ case EVTCHNSTAT_interdomain:
+ if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+ uint16_t be_port = p->type_val & ~PORT_INFO_TYPEVAL_REMOTE_QEMU;
+ struct xenevtchn_handle *xc = s->be_handles[be_port];
+ if (xc) {
+ if (kvm_xen_has_cap(EVTCHN_SEND)) {
+ deassign_kernel_port(port);
+ }
+ xc->guest_port = 0;
+ }
+ } else {
+ /* Loopback interdomain */
+ XenEvtchnPort *rp = &s->port_table[p->type_val];
+ if (!valid_port(p->type_val) || rp->type_val != port ||
+ rp->type != EVTCHNSTAT_interdomain) {
+ error_report("Inconsistent state for interdomain unbind");
+ } else {
+ /* Set the other end back to unbound */
+ rp->type = EVTCHNSTAT_unbound;
+ rp->type_val = 0;
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ free_port(s, port);
+ return 0;
+}
+
+int xen_evtchn_soft_reset(void)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ bool flush_kvm_routes = false;
+ int i;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ assert(qemu_mutex_iothread_locked());
+
+ qemu_mutex_lock(&s->port_lock);
+
+ for (i = 0; i < s->nr_ports; i++) {
+ close_port(s, i, &flush_kvm_routes);
+ }
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ if (flush_kvm_routes) {
+ kvm_update_msi_routes_all(NULL, true, 0, 0);
+ }
+
+ return 0;
+}
+
+int xen_evtchn_reset_op(struct evtchn_reset *reset)
+{
+ if (reset->dom != DOMID_SELF && reset->dom != xen_domid) {
+ return -ESRCH;
+ }
+
+ return xen_evtchn_soft_reset();
+}
+
+int xen_evtchn_close_op(struct evtchn_close *close)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ bool flush_kvm_routes = false;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (!valid_port(close->port)) {
+ return -EINVAL;
+ }
+
+ QEMU_IOTHREAD_LOCK_GUARD();
+ qemu_mutex_lock(&s->port_lock);
+
+ ret = close_port(s, close->port, &flush_kvm_routes);
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ if (flush_kvm_routes) {
+ kvm_update_msi_routes_all(NULL, true, 0, 0);
+ }
+
+ return ret;
+}
+
+int xen_evtchn_unmask_op(struct evtchn_unmask *unmask)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (!valid_port(unmask->port)) {
+ return -EINVAL;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ ret = unmask_port(s, unmask->port, true);
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+}
+
+int xen_evtchn_bind_vcpu_op(struct evtchn_bind_vcpu *vcpu)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ XenEvtchnPort *p;
+ int ret = -EINVAL;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (!valid_port(vcpu->port)) {
+ return -EINVAL;
+ }
+
+ if (!valid_vcpu(vcpu->vcpu)) {
+ return -ENOENT;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ p = &s->port_table[vcpu->port];
+
+ if (p->type == EVTCHNSTAT_interdomain ||
+ p->type == EVTCHNSTAT_unbound ||
+ p->type == EVTCHNSTAT_pirq ||
+ (p->type == EVTCHNSTAT_virq && virq_is_global(p->type_val))) {
+ /*
+ * unmask_port() with do_unmask==false will just raise the event
+ * on the new vCPU if the port was already pending.
+ */
+ p->vcpu = vcpu->vcpu;
+ unmask_port(s, vcpu->port, false);
+ ret = 0;
+ }
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+}
+
+int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (virq->virq >= NR_VIRQS) {
+ return -EINVAL;
+ }
+
+ /* Global VIRQ must be allocated on vCPU0 first */
+ if (virq_is_global(virq->virq) && virq->vcpu != 0) {
+ return -EINVAL;
+ }
+
+ if (!valid_vcpu(virq->vcpu)) {
+ return -ENOENT;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ ret = allocate_port(s, virq->vcpu, EVTCHNSTAT_virq, virq->virq,
+ &virq->port);
+ if (!ret) {
+ ret = kvm_xen_set_vcpu_virq(virq->vcpu, virq->virq, virq->port);
+ if (ret) {
+ free_port(s, virq->port);
+ }
+ }
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+}
+
+int xen_evtchn_bind_pirq_op(struct evtchn_bind_pirq *pirq)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (pirq->pirq >= s->nr_pirqs) {
+ return -EINVAL;
+ }
+
+ QEMU_IOTHREAD_LOCK_GUARD();
+
+ if (s->pirq[pirq->pirq].port) {
+ return -EBUSY;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ ret = allocate_port(s, 0, EVTCHNSTAT_pirq, pirq->pirq,
+ &pirq->port);
+ if (ret) {
+ qemu_mutex_unlock(&s->port_lock);
+ return ret;
+ }
+
+ s->pirq[pirq->pirq].port = pirq->port;
+ trace_kvm_xen_bind_pirq(pirq->pirq, pirq->port);
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ /*
+ * Need to do the unmask outside port_lock because it may call
+ * back into the MSI translate function.
+ */
+ if (s->pirq[pirq->pirq].gsi == IRQ_MSI_EMU) {
+ if (s->pirq[pirq->pirq].is_masked) {
+ PCIDevice *dev = s->pirq[pirq->pirq].dev;
+ int vector = s->pirq[pirq->pirq].vector;
+ char *dev_path = qdev_get_dev_path(DEVICE(dev));
+
+ trace_kvm_xen_unmask_pirq(pirq->pirq, dev_path, vector);
+ g_free(dev_path);
+
+ if (s->pirq[pirq->pirq].is_msix) {
+ msix_set_mask(dev, vector, false);
+ } else {
+ msi_set_mask(dev, vector, false, NULL);
+ }
+ } else if (s->pirq[pirq->pirq].is_translated) {
+ /*
+ * If KVM had attempted to translate this one before, make it try
+ * again. If we unmasked, then the notifier on the MSI(-X) vector
+ * will already have had the same effect.
+ */
+ kvm_update_msi_routes_all(NULL, true, 0, 0);
+ }
+ }
+
+ return ret;
+}
+
+int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (!valid_vcpu(ipi->vcpu)) {
+ return -ENOENT;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ ret = allocate_port(s, ipi->vcpu, EVTCHNSTAT_ipi, 0, &ipi->port);
+ if (!ret && s->evtchn_in_kernel) {
+ assign_kernel_port(EVTCHNSTAT_ipi, ipi->port, ipi->vcpu);
+ }
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+}
+
+int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ uint16_t type_val;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (interdomain->remote_dom == DOMID_QEMU) {
+ type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+ } else if (interdomain->remote_dom == DOMID_SELF ||
+ interdomain->remote_dom == xen_domid) {
+ type_val = 0;
+ } else {
+ return -ESRCH;
+ }
+
+ if (!valid_port(interdomain->remote_port)) {
+ return -EINVAL;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ /* The newly allocated port starts out as unbound */
+ ret = allocate_port(s, 0, EVTCHNSTAT_unbound, type_val,
+ &interdomain->local_port);
+ if (ret) {
+ goto out;
+ }
+
+ if (interdomain->remote_dom == DOMID_QEMU) {
+ struct xenevtchn_handle *xc = s->be_handles[interdomain->remote_port];
+ XenEvtchnPort *lp = &s->port_table[interdomain->local_port];
+
+ if (!xc) {
+ ret = -ENOENT;
+ goto out_free_port;
+ }
+
+ if (xc->guest_port) {
+ ret = -EBUSY;
+ goto out_free_port;
+ }
+
+ assert(xc->be_port == interdomain->remote_port);
+ xc->guest_port = interdomain->local_port;
+ if (kvm_xen_has_cap(EVTCHN_SEND)) {
+ assign_kernel_eventfd(lp->type, xc->guest_port, xc->fd);
+ }
+ lp->type = EVTCHNSTAT_interdomain;
+ lp->type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU | interdomain->remote_port;
+ ret = 0;
+ } else {
+ /* Loopback */
+ XenEvtchnPort *rp = &s->port_table[interdomain->remote_port];
+ XenEvtchnPort *lp = &s->port_table[interdomain->local_port];
+
+ if (rp->type == EVTCHNSTAT_unbound && rp->type_val == 0) {
+ /* It's a match! */
+ rp->type = EVTCHNSTAT_interdomain;
+ rp->type_val = interdomain->local_port;
+
+ lp->type = EVTCHNSTAT_interdomain;
+ lp->type_val = interdomain->remote_port;
+ } else {
+ ret = -EINVAL;
+ }
+ }
+
+ out_free_port:
+ if (ret) {
+ free_port(s, interdomain->local_port);
+ }
+ out:
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+
+}
+int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ uint16_t type_val;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (alloc->dom != DOMID_SELF && alloc->dom != xen_domid) {
+ return -ESRCH;
+ }
+
+ if (alloc->remote_dom == DOMID_QEMU) {
+ type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+ } else if (alloc->remote_dom == DOMID_SELF ||
+ alloc->remote_dom == xen_domid) {
+ type_val = 0;
+ } else {
+ return -EPERM;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ ret = allocate_port(s, 0, EVTCHNSTAT_unbound, type_val, &alloc->port);
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+}
+
+int xen_evtchn_send_op(struct evtchn_send *send)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ XenEvtchnPort *p;
+ int ret = 0;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (!valid_port(send->port)) {
+ return -EINVAL;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ p = &s->port_table[send->port];
+
+ switch (p->type) {
+ case EVTCHNSTAT_interdomain:
+ if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+ /*
+ * This is an event from the guest to qemu itself, which is
+ * serving as the driver domain.
+ */
+ uint16_t be_port = p->type_val & ~PORT_INFO_TYPEVAL_REMOTE_QEMU;
+ struct xenevtchn_handle *xc = s->be_handles[be_port];
+ if (xc) {
+ eventfd_write(xc->fd, 1);
+ ret = 0;
+ } else {
+ ret = -ENOENT;
+ }
+ } else {
+ /* Loopback interdomain ports; just a complex IPI */
+ set_port_pending(s, p->type_val);
+ }
+ break;
+
+ case EVTCHNSTAT_ipi:
+ set_port_pending(s, send->port);
+ break;
+
+ case EVTCHNSTAT_unbound:
+ /* Xen will silently drop these */
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+}
+
+int xen_evtchn_set_port(uint16_t port)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ XenEvtchnPort *p;
+ int ret = -EINVAL;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (!valid_port(port)) {
+ return -EINVAL;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ p = &s->port_table[port];
+
+ /* QEMU has no business sending to anything but these */
+ if (p->type == EVTCHNSTAT_virq ||
+ (p->type == EVTCHNSTAT_interdomain &&
+ (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU))) {
+ set_port_pending(s, port);
+ ret = 0;
+ }
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+}
+
+static int allocate_pirq(XenEvtchnState *s, int type, int gsi)
+{
+ uint16_t pirq;
+
+ /*
+ * Preserve the allocation strategy that Xen has. It looks like
+ * we *never* give out PIRQ 0-15, we give out 16-nr_irqs_gsi only
+ * to GSIs (counting up from 16), and then we count backwards from
+ * the top for MSIs or when the GSI space is exhausted.
+ */
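+ /*
+ * With the default of 256 PIRQs, for example, GSIs are therefore given
+ * PIRQs 16..23 (IOAPIC_NUM_PINS being 24 on x86) and MSIs are handed out
+ * from PIRQ 255 downwards.
+ */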
+ if (type == MAP_PIRQ_TYPE_GSI) {
+ for (pirq = 16 ; pirq < IOAPIC_NUM_PINS; pirq++) {
+ if (pirq_inuse(s, pirq)) {
+ continue;
+ }
+
+ /* Found it */
+ goto found;
+ }
+ }
+ for (pirq = s->nr_pirqs - 1; pirq >= IOAPIC_NUM_PINS; pirq--) {
+ /* Skip whole words at a time when they're full */
+ if (pirq_inuse_word(s, pirq) == UINT64_MAX) {
+ pirq &= ~63ULL;
+ continue;
+ }
+ if (pirq_inuse(s, pirq)) {
+ continue;
+ }
+
+ goto found;
+ }
+ return -ENOSPC;
+
+ found:
+ pirq_inuse_word(s, pirq) |= pirq_inuse_bit(pirq);
+ if (gsi >= 0) {
+ assert(gsi < IOAPIC_NUM_PINS);
+ s->gsi_pirq[gsi] = pirq;
+ }
+ s->pirq[pirq].gsi = gsi;
+ return pirq;
+}
+
+bool xen_evtchn_set_gsi(int gsi, int level)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int pirq;
+
+ assert(qemu_mutex_iothread_locked());
+
+ if (!s || gsi < 0 || gsi >= IOAPIC_NUM_PINS) {
+ return false;
+ }
+
+ /*
+ * Check that it *isn't* the event channel GSI, and thus
+ * that we are not recursing and it's safe to take s->port_lock.
+ *
+ * Locking aside, it's perfectly sane to bail out early for that
+ * special case, as it would make no sense for the event channel
+ * GSI to be routed back to event channels, when the delivery
+ * method is to raise the GSI... that recursion wouldn't *just*
+ * be a locking issue.
+ */
+ if (gsi && gsi == s->callback_gsi) {
+ return false;
+ }
+
+ QEMU_LOCK_GUARD(&s->port_lock);
+
+ pirq = s->gsi_pirq[gsi];
+ if (!pirq) {
+ return false;
+ }
+
+ if (level) {
+ int port = s->pirq[pirq].port;
+
+ s->pirq_gsi_set |= (1U << gsi);
+ if (port) {
+ set_port_pending(s, port);
+ }
+ } else {
+ s->pirq_gsi_set &= ~(1U << gsi);
+ }
+ return true;
+}
+
+static uint32_t msi_pirq_target(uint64_t addr, uint32_t data)
+{
+ /* The vector (in low 8 bits of data) must be zero */
+ if (data & 0xff) {
+ return 0;
+ }
+
+ uint32_t pirq = (addr & 0xff000) >> 12;
+ pirq |= (addr >> 32) & 0xffffff00;
+
+ return pirq;
+}
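+
+/*
+ * For example, addr 0xfee12000 with a zero vector byte in data decodes to
+ * PIRQ 0x12, while PIRQ 0x1fe would need addr 0x00000100feefe000: bits 8 and
+ * up of the PIRQ# land above bit 32 of the address, which is why this
+ * encoding cannot be expressed as a 32-bit MSI message (see the comment in
+ * xen_evtchn_create()).
+ */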
+
+static void do_remove_pci_vector(XenEvtchnState *s, PCIDevice *dev, int vector,
+ int except_pirq)
+{
+ uint32_t pirq;
+
+ for (pirq = 0; pirq < s->nr_pirqs; pirq++) {
+ /*
+ * We could be cleverer here, but it isn't really a fast path, and
+ * this trivial optimisation is enough to let us skip the big gap
+ * in the middle a bit quicker (in terms of both loop iterations,
+ * and cache lines).
+ */
+ if (!(pirq & 63) && !(pirq_inuse_word(s, pirq))) {
+ pirq += 63; /* the loop increment steps onto the next word */
+ continue;
+ }
+ if (except_pirq && pirq == except_pirq) {
+ continue;
+ }
+ if (s->pirq[pirq].dev != dev) {
+ continue;
+ }
+ if (vector != -1 && s->pirq[pirq].vector != vector) {
+ continue;
+ }
+
+ /* It could theoretically be bound to a port already, but that is OK. */
+ s->pirq[pirq].dev = dev;
+ s->pirq[pirq].gsi = IRQ_UNBOUND;
+ s->pirq[pirq].is_msix = false;
+ s->pirq[pirq].vector = 0;
+ s->pirq[pirq].is_masked = false;
+ s->pirq[pirq].is_translated = false;
+ }
+}
+
+void xen_evtchn_remove_pci_device(PCIDevice *dev)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+
+ if (!s) {
+ return;
+ }
+
+ QEMU_LOCK_GUARD(&s->port_lock);
+ do_remove_pci_vector(s, dev, -1, 0);
+}
+
+void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector,
+ uint64_t addr, uint32_t data, bool is_masked)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ uint32_t pirq;
+
+ if (!s) {
+ return;
+ }
+
+ assert(qemu_mutex_iothread_locked());
+
+ pirq = msi_pirq_target(addr, data);
+
+ /*
+ * The PIRQ# must be sane, and there must be an allocated PIRQ in
+ * IRQ_UNBOUND or IRQ_MSI_EMU state to match it.
+ */
+ if (!pirq || pirq >= s->nr_pirqs || !pirq_inuse(s, pirq) ||
+ (s->pirq[pirq].gsi != IRQ_UNBOUND &&
+ s->pirq[pirq].gsi != IRQ_MSI_EMU)) {
+ pirq = 0;
+ }
+
+ if (pirq) {
+ s->pirq[pirq].dev = dev;
+ s->pirq[pirq].gsi = IRQ_MSI_EMU;
+ s->pirq[pirq].is_msix = is_msix;
+ s->pirq[pirq].vector = vector;
+ s->pirq[pirq].is_masked = is_masked;
+ }
+
+ /* Remove any (other) entries for this {device, vector} */
+ do_remove_pci_vector(s, dev, vector, pirq);
+}
+
+int xen_evtchn_translate_pirq_msi(struct kvm_irq_routing_entry *route,
+ uint64_t address, uint32_t data)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ uint32_t pirq, port;
+ CPUState *cpu;
+
+ if (!s) {
+ return 1; /* Not a PIRQ */
+ }
+
+ assert(qemu_mutex_iothread_locked());
+
+ pirq = msi_pirq_target(address, data);
+ if (!pirq || pirq >= s->nr_pirqs) {
+ return 1; /* Not a PIRQ */
+ }
+
+ if (!kvm_xen_has_cap(EVTCHN_2LEVEL)) {
+ return -ENOTSUP;
+ }
+
+ if (s->pirq[pirq].gsi != IRQ_MSI_EMU) {
+ return -EINVAL;
+ }
+
+ /* Remember that KVM tried to translate this. It might need to try again. */
+ s->pirq[pirq].is_translated = true;
+
+ QEMU_LOCK_GUARD(&s->port_lock);
+
+ port = s->pirq[pirq].port;
+ if (!valid_port(port)) {
+ return -EINVAL;
+ }
+
+ cpu = qemu_get_cpu(s->port_table[port].vcpu);
+ if (!cpu) {
+ return -EINVAL;
+ }
+
+ route->type = KVM_IRQ_ROUTING_XEN_EVTCHN;
+ route->u.xen_evtchn.port = port;
+ route->u.xen_evtchn.vcpu = kvm_arch_vcpu_id(cpu);
+ route->u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ return 0; /* Handled */
+}
+
+bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ uint32_t pirq, port;
+
+ if (!s) {
+ return false;
+ }
+
+ assert(qemu_mutex_iothread_locked());
+
+ pirq = msi_pirq_target(address, data);
+ if (!pirq || pirq >= s->nr_pirqs) {
+ return false;
+ }
+
+ QEMU_LOCK_GUARD(&s->port_lock);
+
+ port = s->pirq[pirq].port;
+ if (!valid_port(port)) {
+ return false;
+ }
+
+ set_port_pending(s, port);
+ return true;
+}
+
+int xen_physdev_map_pirq(struct physdev_map_pirq *map)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int pirq = map->pirq;
+ int gsi = map->index;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ QEMU_IOTHREAD_LOCK_GUARD();
+ QEMU_LOCK_GUARD(&s->port_lock);
+
+ if (map->domid != DOMID_SELF && map->domid != xen_domid) {
+ return -EPERM;
+ }
+ if (map->type != MAP_PIRQ_TYPE_GSI) {
+ return -EINVAL;
+ }
+ if (gsi < 0 || gsi >= IOAPIC_NUM_PINS) {
+ return -EINVAL;
+ }
+
+ if (pirq < 0) {
+ pirq = allocate_pirq(s, map->type, gsi);
+ if (pirq < 0) {
+ return pirq;
+ }
+ map->pirq = pirq;
+ } else if (pirq >= s->nr_pirqs) {
+ return -EINVAL;
+ } else {
+ /*
+ * User specified a valid-looking PIRQ#. Allow it if it is
+ * allocated and not yet bound, or if it is unallocated
+ */
+ if (pirq_inuse(s, pirq)) {
+ if (s->pirq[pirq].gsi != IRQ_UNBOUND) {
+ return -EBUSY;
+ }
+ } else {
+ /* If it was unused, mark it used now. */
+ pirq_inuse_word(s, pirq) |= pirq_inuse_bit(pirq);
+ }
+ /* Set the mapping in both directions. */
+ s->pirq[pirq].gsi = gsi;
+ s->gsi_pirq[gsi] = pirq;
+ }
+
+ trace_kvm_xen_map_pirq(pirq, gsi);
+ return 0;
+}
+
+int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int pirq = unmap->pirq;
+ int gsi;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (unmap->domid != DOMID_SELF && unmap->domid != xen_domid) {
+ return -EPERM;
+ }
+ if (pirq < 0 || pirq >= s->nr_pirqs) {
+ return -EINVAL;
+ }
+
+ QEMU_IOTHREAD_LOCK_GUARD();
+ qemu_mutex_lock(&s->port_lock);
+
+ if (!pirq_inuse(s, pirq)) {
+ qemu_mutex_unlock(&s->port_lock);
+ return -ENOENT;
+ }
+
+ gsi = s->pirq[pirq].gsi;
+
+ /* We can only unmap GSI PIRQs */
+ if (gsi < 0) {
+ qemu_mutex_unlock(&s->port_lock);
+ return -EINVAL;
+ }
+
+ s->gsi_pirq[gsi] = 0;
+ s->pirq[pirq].gsi = IRQ_UNBOUND; /* Doesn't actually matter because: */
+ pirq_inuse_word(s, pirq) &= ~pirq_inuse_bit(pirq);
+
+ trace_kvm_xen_unmap_pirq(pirq, gsi);
+ qemu_mutex_unlock(&s->port_lock);
+
+ if (gsi == IRQ_MSI_EMU) {
+ kvm_update_msi_routes_all(NULL, true, 0, 0);
+ }
+
+ return 0;
+}
+
+int xen_physdev_eoi_pirq(struct physdev_eoi *eoi)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int pirq = eoi->irq;
+ int gsi;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ QEMU_IOTHREAD_LOCK_GUARD();
+ QEMU_LOCK_GUARD(&s->port_lock);
+
+ if (!pirq_inuse(s, pirq)) {
+ return -ENOENT;
+ }
+
+ gsi = s->pirq[pirq].gsi;
+ if (gsi < 0) {
+ return -EINVAL;
+ }
+
+ /* Reassert a level IRQ if needed */
+ if (s->pirq_gsi_set & (1U << gsi)) {
+ int port = s->pirq[pirq].port;
+ if (port) {
+ set_port_pending(s, port);
+ }
+ }
+
+ return 0;
+}
+
+int xen_physdev_query_pirq(struct physdev_irq_status_query *query)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int pirq = query->irq;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ QEMU_IOTHREAD_LOCK_GUARD();
+ QEMU_LOCK_GUARD(&s->port_lock);
+
+ if (!pirq_inuse(s, pirq)) {
+ return -ENOENT;
+ }
+
+ if (s->pirq[pirq].gsi >= 0) {
+ query->flags = XENIRQSTAT_needs_eoi;
+ } else {
+ query->flags = 0;
+ }
+
+ return 0;
+}
+
+int xen_physdev_get_free_pirq(struct physdev_get_free_pirq *get)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int pirq;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ QEMU_LOCK_GUARD(&s->port_lock);
+
+ pirq = allocate_pirq(s, get->type, IRQ_UNBOUND);
+ if (pirq < 0) {
+ return pirq;
+ }
+
+ get->pirq = pirq;
+ trace_kvm_xen_get_free_pirq(pirq, get->type);
+ return 0;
+}
+
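+/*
+ * The xen_be_evtchn_* functions below give in-QEMU PV backends a local,
+ * emulated stand-in for the libxenevtchn interface they would otherwise
+ * use. A rough sketch of the expected flow: open a handle with
+ * xen_be_evtchn_open(), bind it to the guest's unbound port with
+ * xen_be_evtchn_bind_interdomain(xc, xen_domid, guest_port), poll the fd
+ * returned by xen_be_evtchn_fd() from the event loop, consume events with
+ * xen_be_evtchn_pending() / xen_be_evtchn_unmask(), raise events towards
+ * the guest with xen_be_evtchn_notify(), and tear the handle down again
+ * with xen_be_evtchn_unbind() / xen_be_evtchn_close().
+ */
+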
+struct xenevtchn_handle *xen_be_evtchn_open(void)
+{
+ struct xenevtchn_handle *xc = g_new0(struct xenevtchn_handle, 1);
+
+ xc->fd = eventfd(0, EFD_CLOEXEC);
+ if (xc->fd < 0) {
+ g_free(xc);
+ return NULL;
+ }
+
+ return xc;
+}
+
+static int find_be_port(XenEvtchnState *s, struct xenevtchn_handle *xc)
+{
+ int i;
+
+ for (i = 1; i < EVTCHN_2L_NR_CHANNELS; i++) {
+ if (!s->be_handles[i]) {
+ s->be_handles[i] = xc;
+ xc->be_port = i;
+ return i;
+ }
+ }
+ return 0;
+}
+
+int xen_be_evtchn_bind_interdomain(struct xenevtchn_handle *xc, uint32_t domid,
+ evtchn_port_t guest_port)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ XenEvtchnPort *gp;
+ uint16_t be_port = 0;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (!xc) {
+ return -EFAULT;
+ }
+
+ if (domid != xen_domid) {
+ return -ESRCH;
+ }
+
+ if (!valid_port(guest_port)) {
+ return -EINVAL;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ /* The guest has to have an unbound port waiting for us to bind */
+ gp = &s->port_table[guest_port];
+
+ switch (gp->type) {
+ case EVTCHNSTAT_interdomain:
+ /* Allow rebinding after migration, preserve port # if possible */
+ be_port = gp->type_val & ~PORT_INFO_TYPEVAL_REMOTE_QEMU;
+ assert(be_port != 0);
+ if (!s->be_handles[be_port]) {
+ s->be_handles[be_port] = xc;
+ xc->guest_port = guest_port;
+ ret = xc->be_port = be_port;
+ if (kvm_xen_has_cap(EVTCHN_SEND)) {
+ assign_kernel_eventfd(gp->type, guest_port, xc->fd);
+ }
+ break;
+ }
+ /* fall through */
+
+ case EVTCHNSTAT_unbound:
+ be_port = find_be_port(s, xc);
+ if (!be_port) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ gp->type = EVTCHNSTAT_interdomain;
+ gp->type_val = be_port | PORT_INFO_TYPEVAL_REMOTE_QEMU;
+ xc->guest_port = guest_port;
+ if (kvm_xen_has_cap(EVTCHN_SEND)) {
+ assign_kernel_eventfd(gp->type, guest_port, xc->fd);
+ }
+ ret = be_port;
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ out:
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+}
+
+int xen_be_evtchn_unbind(struct xenevtchn_handle *xc, evtchn_port_t port)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (!xc) {
+ return -EFAULT;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ if (port && port != xc->be_port) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (xc->guest_port) {
+ XenEvtchnPort *gp = &s->port_table[xc->guest_port];
+
+ /* This should never *not* be true */
+ if (gp->type == EVTCHNSTAT_interdomain) {
+ gp->type = EVTCHNSTAT_unbound;
+ gp->type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+ }
+
+ if (kvm_xen_has_cap(EVTCHN_SEND)) {
+ deassign_kernel_port(xc->guest_port);
+ }
+ xc->guest_port = 0;
+ }
+
+ s->be_handles[xc->be_port] = NULL;
+ xc->be_port = 0;
+ ret = 0;
+ out:
+ qemu_mutex_unlock(&s->port_lock);
+ return ret;
+}
+
+int xen_be_evtchn_close(struct xenevtchn_handle *xc)
+{
+ if (!xc) {
+ return -EFAULT;
+ }
+
+ xen_be_evtchn_unbind(xc, 0);
+
+ close(xc->fd);
+ g_free(xc);
+ return 0;
+}
+
+int xen_be_evtchn_fd(struct xenevtchn_handle *xc)
+{
+ if (!xc) {
+ return -1;
+ }
+ return xc->fd;
+}
+
+int xen_be_evtchn_notify(struct xenevtchn_handle *xc, evtchn_port_t port)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ int ret;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (!xc) {
+ return -EFAULT;
+ }
+
+ qemu_mutex_lock(&s->port_lock);
+
+ if (xc->guest_port) {
+ set_port_pending(s, xc->guest_port);
+ ret = 0;
+ } else {
+ ret = -ENOTCONN;
+ }
+
+ qemu_mutex_unlock(&s->port_lock);
+
+ return ret;
+}
+
+int xen_be_evtchn_pending(struct xenevtchn_handle *xc)
+{
+ uint64_t val;
+
+ if (!xc) {
+ return -EFAULT;
+ }
+
+ if (!xc->be_port) {
+ return 0;
+ }
+
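+ /* Reading the eventfd consumes the pending notification. */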
+ if (eventfd_read(xc->fd, &val)) {
+ return -errno;
+ }
+
+ return val ? xc->be_port : 0;
+}
+
+int xen_be_evtchn_unmask(struct xenevtchn_handle *xc, evtchn_port_t port)
+{
+ if (!xc) {
+ return -EFAULT;
+ }
+
+ if (xc->be_port != port) {
+ return -EINVAL;
+ }
+
+ /*
+ * We don't actually do anything to unmask it; the event was already
+ * consumed in xen_be_evtchn_pending().
+ */
+ return 0;
+}
+
+int xen_be_evtchn_get_guest_port(struct xenevtchn_handle *xc)
+{
+ return xc->guest_port;
+}
+
+EvtchnInfoList *qmp_xen_event_list(Error **errp)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+ EvtchnInfoList *head = NULL, **tail = &head;
+ void *shinfo, *pending, *mask;
+ int i;
+
+ if (!s) {
+ error_setg(errp, "Xen event channel emulation not enabled");
+ return NULL;
+ }
+
+ shinfo = xen_overlay_get_shinfo_ptr();
+ if (!shinfo) {
+ error_setg(errp, "Xen shared info page not allocated");
+ return NULL;
+ }
+
+ if (xen_is_long_mode()) {
+ pending = shinfo + offsetof(struct shared_info, evtchn_pending);
+ mask = shinfo + offsetof(struct shared_info, evtchn_mask);
+ } else {
+ pending = shinfo + offsetof(struct compat_shared_info, evtchn_pending);
+ mask = shinfo + offsetof(struct compat_shared_info, evtchn_mask);
+ }
+
+ QEMU_LOCK_GUARD(&s->port_lock);
+
+ for (i = 0; i < s->nr_ports; i++) {
+ XenEvtchnPort *p = &s->port_table[i];
+ EvtchnInfo *info;
+
+ if (p->type == EVTCHNSTAT_closed) {
+ continue;
+ }
+
+ info = g_new0(EvtchnInfo, 1);
+
+ info->port = i;
+ qemu_build_assert(EVTCHN_PORT_TYPE_CLOSED == EVTCHNSTAT_closed);
+ qemu_build_assert(EVTCHN_PORT_TYPE_UNBOUND == EVTCHNSTAT_unbound);
+ qemu_build_assert(EVTCHN_PORT_TYPE_INTERDOMAIN == EVTCHNSTAT_interdomain);
+ qemu_build_assert(EVTCHN_PORT_TYPE_PIRQ == EVTCHNSTAT_pirq);
+ qemu_build_assert(EVTCHN_PORT_TYPE_VIRQ == EVTCHNSTAT_virq);
+ qemu_build_assert(EVTCHN_PORT_TYPE_IPI == EVTCHNSTAT_ipi);
+
+ info->type = p->type;
+ if (p->type == EVTCHNSTAT_interdomain) {
+ info->remote_domain = g_strdup((p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) ?
+ "qemu" : "loopback");
+ info->target = p->type_val & PORT_INFO_TYPEVAL_REMOTE_PORT_MASK;
+ } else {
+ info->target = p->type_val;
+ }
+ info->vcpu = p->vcpu;
+ info->pending = test_bit(i, pending);
+ info->masked = test_bit(i, mask);
+
+ QAPI_LIST_APPEND(tail, info);
+ }
+
+ return head;
+}
+
+void qmp_xen_event_inject(uint32_t port, Error **errp)
+{
+ XenEvtchnState *s = xen_evtchn_singleton;
+
+ if (!s) {
+ error_setg(errp, "Xen event channel emulation not enabled");
+ return;
+ }
+
+ if (!valid_port(port)) {
+ error_setg(errp, "Invalid port %u", port);
+ return;
+ }
+
+ QEMU_LOCK_GUARD(&s->port_lock);
+
+ if (set_port_pending(s, port)) {
+ error_setg(errp, "Failed to set port %u", port);
+ return;
+ }
+}
+
+void hmp_xen_event_list(Monitor *mon, const QDict *qdict)
+{
+ EvtchnInfoList *iter, *info_list;
+ Error *err = NULL;
+
+ info_list = qmp_xen_event_list(&err);
+ if (err) {
+ hmp_handle_error(mon, err);
+ return;
+ }
+
+ for (iter = info_list; iter; iter = iter->next) {
+ EvtchnInfo *info = iter->value;
+
+ monitor_printf(mon, "port %4u: vcpu: %d %s", info->port, info->vcpu,
+ EvtchnPortType_str(info->type));
+ if (info->type != EVTCHN_PORT_TYPE_IPI) {
+ monitor_printf(mon, "(");
+ if (info->remote_domain) {
+ monitor_printf(mon, "%s:", info->remote_domain);
+ }
+ monitor_printf(mon, "%d)", info->target);
+ }
+ if (info->pending) {
+ monitor_printf(mon, " PENDING");
+ }
+ if (info->masked) {
+ monitor_printf(mon, " MASKED");
+ }
+ monitor_printf(mon, "\n");
+ }
+
+ qapi_free_EvtchnInfoList(info_list);
+}
+
+void hmp_xen_event_inject(Monitor *mon, const QDict *qdict)
+{
+ int port = qdict_get_int(qdict, "port");
+ Error *err = NULL;
+
+ qmp_xen_event_inject(port, &err);
+ if (err) {
+ hmp_handle_error(mon, err);
+ } else {
+ monitor_printf(mon, "Delivered port %d\n", port);
+ }
+}
+
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
new file mode 100644
index 0000000..bfb67ac
--- /dev/null
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -0,0 +1,88 @@
+/*
+ * QEMU Xen emulation: Event channel support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_XEN_EVTCHN_H
+#define QEMU_XEN_EVTCHN_H
+
+#include "hw/sysbus.h"
+
+typedef uint32_t evtchn_port_t;
+
+void xen_evtchn_create(void);
+int xen_evtchn_soft_reset(void);
+int xen_evtchn_set_callback_param(uint64_t param);
+void xen_evtchn_connect_gsis(qemu_irq *system_gsis);
+void xen_evtchn_set_callback_level(int level);
+
+int xen_evtchn_set_port(uint16_t port);
+
+bool xen_evtchn_set_gsi(int gsi, int level);
+void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector,
+ uint64_t addr, uint32_t data, bool is_masked);
+void xen_evtchn_remove_pci_device(PCIDevice *dev);
+struct kvm_irq_routing_entry;
+int xen_evtchn_translate_pirq_msi(struct kvm_irq_routing_entry *route,
+ uint64_t address, uint32_t data);
+bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data);
+
+
+/*
+ * These functions mirror the libxenevtchn library API, providing the QEMU
+ * backend side of "interdomain" event channels.
+ */
+struct xenevtchn_handle;
+struct xenevtchn_handle *xen_be_evtchn_open(void);
+int xen_be_evtchn_bind_interdomain(struct xenevtchn_handle *xc, uint32_t domid,
+ evtchn_port_t guest_port);
+int xen_be_evtchn_unbind(struct xenevtchn_handle *xc, evtchn_port_t port);
+int xen_be_evtchn_close(struct xenevtchn_handle *xc);
+int xen_be_evtchn_fd(struct xenevtchn_handle *xc);
+int xen_be_evtchn_notify(struct xenevtchn_handle *xc, evtchn_port_t port);
+int xen_be_evtchn_unmask(struct xenevtchn_handle *xc, evtchn_port_t port);
+int xen_be_evtchn_pending(struct xenevtchn_handle *xc);
+/* Apart from this one, which is a local addition */
+int xen_be_evtchn_get_guest_port(struct xenevtchn_handle *xc);
+
+struct evtchn_status;
+struct evtchn_close;
+struct evtchn_unmask;
+struct evtchn_bind_virq;
+struct evtchn_bind_pirq;
+struct evtchn_bind_ipi;
+struct evtchn_send;
+struct evtchn_alloc_unbound;
+struct evtchn_bind_interdomain;
+struct evtchn_bind_vcpu;
+struct evtchn_reset;
+int xen_evtchn_status_op(struct evtchn_status *status);
+int xen_evtchn_close_op(struct evtchn_close *close);
+int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
+int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
+int xen_evtchn_bind_pirq_op(struct evtchn_bind_pirq *pirq);
+int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
+int xen_evtchn_send_op(struct evtchn_send *send);
+int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
+int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain);
+int xen_evtchn_bind_vcpu_op(struct evtchn_bind_vcpu *vcpu);
+int xen_evtchn_reset_op(struct evtchn_reset *reset);
+
+struct physdev_map_pirq;
+struct physdev_unmap_pirq;
+struct physdev_eoi;
+struct physdev_irq_status_query;
+struct physdev_get_free_pirq;
+int xen_physdev_map_pirq(struct physdev_map_pirq *map);
+int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap);
+int xen_physdev_eoi_pirq(struct physdev_eoi *eoi);
+int xen_physdev_query_pirq(struct physdev_irq_status_query *query);
+int xen_physdev_get_free_pirq(struct physdev_get_free_pirq *get);
+
+#endif /* QEMU_XEN_EVTCHN_H */
diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
new file mode 100644
index 0000000..1e691de
--- /dev/null
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -0,0 +1,232 @@
+/*
+ * QEMU Xen emulation: Grant table support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/lockable.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "exec/target_page.h"
+#include "exec/address-spaces.h"
+#include "migration/vmstate.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "xen_overlay.h"
+#include "xen_gnttab.h"
+
+#include "sysemu/kvm.h"
+#include "sysemu/kvm_xen.h"
+
+#include "hw/xen/interface/memory.h"
+#include "hw/xen/interface/grant_table.h"
+
+#define TYPE_XEN_GNTTAB "xen-gnttab"
+OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB)
+
+#define XEN_PAGE_SHIFT 12
+#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
+
+#define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
+
+struct XenGnttabState {
+ /*< private >*/
+ SysBusDevice busdev;
+ /*< public >*/
+
+ QemuMutex gnt_lock;
+
+ uint32_t nr_frames;
+ uint32_t max_frames;
+
+ union {
+ grant_entry_v1_t *v1;
+ /* Theoretically, v2 support could be added here. */
+ } entries;
+
+ MemoryRegion gnt_frames;
+ MemoryRegion *gnt_aliases;
+ uint64_t *gnt_frame_gpas;
+};
+
+struct XenGnttabState *xen_gnttab_singleton;
+
+static void xen_gnttab_realize(DeviceState *dev, Error **errp)
+{
+ XenGnttabState *s = XEN_GNTTAB(dev);
+ int i;
+
+ if (xen_mode != XEN_EMULATE) {
+ error_setg(errp, "Xen grant table support is for Xen emulation");
+ return;
+ }
+ s->nr_frames = 0;
+ s->max_frames = kvm_xen_get_gnttab_max_frames();
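+
+ /*
+ * Back the full set of grant frames with RAM up front; individual
+ * frames are mapped into the guest address space only on demand via
+ * xen_gnttab_map_page().
+ */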
+ memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table",
+ XEN_PAGE_SIZE * s->max_frames, &error_abort);
+ memory_region_set_enabled(&s->gnt_frames, true);
+ s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames);
+ memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
+
+ /* Create individual page-sized aliases for overlays */
+ s->gnt_aliases = (void *)g_new0(MemoryRegion, s->max_frames);
+ s->gnt_frame_gpas = (void *)g_new(uint64_t, s->max_frames);
+ for (i = 0; i < s->max_frames; i++) {
+ memory_region_init_alias(&s->gnt_aliases[i], OBJECT(dev),
+ NULL, &s->gnt_frames,
+ i * XEN_PAGE_SIZE, XEN_PAGE_SIZE);
+ s->gnt_frame_gpas[i] = INVALID_GPA;
+ }
+
+ qemu_mutex_init(&s->gnt_lock);
+
+ xen_gnttab_singleton = s;
+}
+
+static int xen_gnttab_post_load(void *opaque, int version_id)
+{
+ XenGnttabState *s = XEN_GNTTAB(opaque);
+ uint32_t i;
+
+ for (i = 0; i < s->nr_frames; i++) {
+ if (s->gnt_frame_gpas[i] != INVALID_GPA) {
+ xen_overlay_do_map_page(&s->gnt_aliases[i], s->gnt_frame_gpas[i]);
+ }
+ }
+ return 0;
+}
+
+static bool xen_gnttab_is_needed(void *opaque)
+{
+ return xen_mode == XEN_EMULATE;
+}
+
+static const VMStateDescription xen_gnttab_vmstate = {
+ .name = "xen_gnttab",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = xen_gnttab_is_needed,
+ .post_load = xen_gnttab_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(nr_frames, XenGnttabState),
+ VMSTATE_VARRAY_UINT32(gnt_frame_gpas, XenGnttabState, nr_frames, 0,
+ vmstate_info_uint64, uint64_t),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void xen_gnttab_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->realize = xen_gnttab_realize;
+ dc->vmsd = &xen_gnttab_vmstate;
+}
+
+static const TypeInfo xen_gnttab_info = {
+ .name = TYPE_XEN_GNTTAB,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(XenGnttabState),
+ .class_init = xen_gnttab_class_init,
+};
+
+void xen_gnttab_create(void)
+{
+ xen_gnttab_singleton = XEN_GNTTAB(sysbus_create_simple(TYPE_XEN_GNTTAB,
+ -1, NULL));
+}
+
+static void xen_gnttab_register_types(void)
+{
+ type_register_static(&xen_gnttab_info);
+}
+
+type_init(xen_gnttab_register_types)
+
+int xen_gnttab_map_page(uint64_t idx, uint64_t gfn)
+{
+ XenGnttabState *s = xen_gnttab_singleton;
+ uint64_t gpa = gfn << XEN_PAGE_SHIFT;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (idx >= s->max_frames) {
+ return -EINVAL;
+ }
+
+ QEMU_IOTHREAD_LOCK_GUARD();
+ QEMU_LOCK_GUARD(&s->gnt_lock);
+
+ xen_overlay_do_map_page(&s->gnt_aliases[idx], gpa);
+
+ s->gnt_frame_gpas[idx] = gpa;
+
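+ /* nr_frames tracks the highest frame mapped so far, for query_size and migration. */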
+ if (s->nr_frames <= idx) {
+ s->nr_frames = idx + 1;
+ }
+
+ return 0;
+}
+
+int xen_gnttab_set_version_op(struct gnttab_set_version *set)
+{
+ int ret;
+
+ switch (set->version) {
+ case 1:
+ ret = 0;
+ break;
+
+ case 2:
+ /* Behave as before set_version was introduced. */
+ ret = -ENOSYS;
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+ set->version = 1;
+ return ret;
+}
+
+int xen_gnttab_get_version_op(struct gnttab_get_version *get)
+{
+ if (get->dom != DOMID_SELF && get->dom != xen_domid) {
+ return -ESRCH;
+ }
+
+ get->version = 1;
+ return 0;
+}
+
+int xen_gnttab_query_size_op(struct gnttab_query_size *size)
+{
+ XenGnttabState *s = xen_gnttab_singleton;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ if (size->dom != DOMID_SELF && size->dom != xen_domid) {
+ size->status = GNTST_bad_domain;
+ return 0;
+ }
+
+ size->status = GNTST_okay;
+ size->nr_frames = s->nr_frames;
+ size->max_nr_frames = s->max_frames;
+ return 0;
+}
diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h
new file mode 100644
index 0000000..3bdbe96
--- /dev/null
+++ b/hw/i386/kvm/xen_gnttab.h
@@ -0,0 +1,25 @@
+/*
+ * QEMU Xen emulation: Grant table support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_XEN_GNTTAB_H
+#define QEMU_XEN_GNTTAB_H
+
+void xen_gnttab_create(void);
+int xen_gnttab_map_page(uint64_t idx, uint64_t gfn);
+
+struct gnttab_set_version;
+struct gnttab_get_version;
+struct gnttab_query_size;
+int xen_gnttab_set_version_op(struct gnttab_set_version *set);
+int xen_gnttab_get_version_op(struct gnttab_get_version *get);
+int xen_gnttab_query_size_op(struct gnttab_query_size *size);
+
+#endif /* QEMU_XEN_GNTTAB_H */
diff --git a/hw/i386/kvm/xen_overlay.c b/hw/i386/kvm/xen_overlay.c
new file mode 100644
index 0000000..39fda1b
--- /dev/null
+++ b/hw/i386/kvm/xen_overlay.c
@@ -0,0 +1,272 @@
+/*
+ * QEMU Xen emulation: Shared/overlay pages support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "exec/target_page.h"
+#include "exec/address-spaces.h"
+#include "migration/vmstate.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "xen_overlay.h"
+
+#include "sysemu/kvm.h"
+#include "sysemu/kvm_xen.h"
+#include <linux/kvm.h>
+
+#include "hw/xen/interface/memory.h"
+
+
+#define TYPE_XEN_OVERLAY "xen-overlay"
+OBJECT_DECLARE_SIMPLE_TYPE(XenOverlayState, XEN_OVERLAY)
+
+#define XEN_PAGE_SHIFT 12
+#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
+
+struct XenOverlayState {
+ /*< private >*/
+ SysBusDevice busdev;
+ /*< public >*/
+
+ MemoryRegion shinfo_mem;
+ void *shinfo_ptr;
+ uint64_t shinfo_gpa;
+ bool long_mode;
+};
+
+struct XenOverlayState *xen_overlay_singleton;
+
+void xen_overlay_do_map_page(MemoryRegion *page, uint64_t gpa)
+{
+ /*
+ * Xen allows guests to map the same page as many times as they like
+ * into guest physical frames. We don't, because it would be hard
+ * to track and restore them all. One mapping of each page is
+ * perfectly sufficient for all known guests... and we've tested
+ * that theory on a few now in other implementations. dwmw2.
+ */
+ if (memory_region_is_mapped(page)) {
+ if (gpa == INVALID_GPA) {
+ memory_region_del_subregion(get_system_memory(), page);
+ } else {
+ /* Just move it */
+ memory_region_set_address(page, gpa);
+ }
+ } else if (gpa != INVALID_GPA) {
+ memory_region_add_subregion_overlap(get_system_memory(), gpa, page, 0);
+ }
+}
+
+/* KVM is the only existing back end for now. Let's not overengineer it yet. */
+static int xen_overlay_set_be_shinfo(uint64_t gfn)
+{
+ struct kvm_xen_hvm_attr xa = {
+ .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+ .u.shared_info.gfn = gfn,
+ };
+
+ return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+}
+
+
+static void xen_overlay_realize(DeviceState *dev, Error **errp)
+{
+ XenOverlayState *s = XEN_OVERLAY(dev);
+
+ if (xen_mode != XEN_EMULATE) {
+ error_setg(errp, "Xen overlay page support is for Xen emulation");
+ return;
+ }
+
+ memory_region_init_ram(&s->shinfo_mem, OBJECT(dev), "xen:shared_info",
+ XEN_PAGE_SIZE, &error_abort);
+ memory_region_set_enabled(&s->shinfo_mem, true);
+
+ s->shinfo_ptr = memory_region_get_ram_ptr(&s->shinfo_mem);
+ s->shinfo_gpa = INVALID_GPA;
+ s->long_mode = false;
+ memset(s->shinfo_ptr, 0, XEN_PAGE_SIZE);
+}
+
+static int xen_overlay_pre_save(void *opaque)
+{
+ /*
+ * Fetch the kernel's idea of long_mode to avoid the race condition
+ * where the guest has set the hypercall page up in 64-bit mode but
+ * not yet made a hypercall by the time migration happens, so qemu
+ * hasn't yet noticed.
+ */
+ return xen_sync_long_mode();
+}
+
+static int xen_overlay_post_load(void *opaque, int version_id)
+{
+ XenOverlayState *s = opaque;
+
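+ /* Re-establish the guest mapping and the KVM shared_info attribute after migration. */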
+ if (s->shinfo_gpa != INVALID_GPA) {
+ xen_overlay_do_map_page(&s->shinfo_mem, s->shinfo_gpa);
+ xen_overlay_set_be_shinfo(s->shinfo_gpa >> XEN_PAGE_SHIFT);
+ }
+ if (s->long_mode) {
+ xen_set_long_mode(true);
+ }
+
+ return 0;
+}
+
+static bool xen_overlay_is_needed(void *opaque)
+{
+ return xen_mode == XEN_EMULATE;
+}
+
+static const VMStateDescription xen_overlay_vmstate = {
+ .name = "xen_overlay",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = xen_overlay_is_needed,
+ .pre_save = xen_overlay_pre_save,
+ .post_load = xen_overlay_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(shinfo_gpa, XenOverlayState),
+ VMSTATE_BOOL(long_mode, XenOverlayState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void xen_overlay_reset(DeviceState *dev)
+{
+ kvm_xen_soft_reset();
+}
+
+static void xen_overlay_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->reset = xen_overlay_reset;
+ dc->realize = xen_overlay_realize;
+ dc->vmsd = &xen_overlay_vmstate;
+}
+
+static const TypeInfo xen_overlay_info = {
+ .name = TYPE_XEN_OVERLAY,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(XenOverlayState),
+ .class_init = xen_overlay_class_init,
+};
+
+void xen_overlay_create(void)
+{
+ xen_overlay_singleton = XEN_OVERLAY(sysbus_create_simple(TYPE_XEN_OVERLAY,
+ -1, NULL));
+
+ /* If xen_domid wasn't explicitly set, at least make sure it isn't zero. */
+ if (xen_domid == DOMID_QEMU) {
+ xen_domid = 1;
+ }
+}
+
+static void xen_overlay_register_types(void)
+{
+ type_register_static(&xen_overlay_info);
+}
+
+type_init(xen_overlay_register_types)
+
+int xen_overlay_map_shinfo_page(uint64_t gpa)
+{
+ XenOverlayState *s = xen_overlay_singleton;
+ int ret;
+
+ if (!s) {
+ return -ENOENT;
+ }
+
+ assert(qemu_mutex_iothread_locked());
+
+ if (s->shinfo_gpa) {
+ /* If removing shinfo page, turn the kernel magic off first */
+ ret = xen_overlay_set_be_shinfo(INVALID_GFN);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ xen_overlay_do_map_page(&s->shinfo_mem, gpa);
+ if (gpa != INVALID_GPA) {
+ ret = xen_overlay_set_be_shinfo(gpa >> XEN_PAGE_SHIFT);
+ if (ret) {
+ return ret;
+ }
+ }
+ s->shinfo_gpa = gpa;
+
+ return 0;
+}
+
+void *xen_overlay_get_shinfo_ptr(void)
+{
+ XenOverlayState *s = xen_overlay_singleton;
+
+ if (!s) {
+ return NULL;
+ }
+
+ return s->shinfo_ptr;
+}
+
+int xen_sync_long_mode(void)
+{
+ int ret;
+ struct kvm_xen_hvm_attr xa = {
+ .type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+ };
+
+ if (!xen_overlay_singleton) {
+ return -ENOENT;
+ }
+
+ ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_GET_ATTR, &xa);
+ if (!ret) {
+ xen_overlay_singleton->long_mode = xa.u.long_mode;
+ }
+
+ return ret;
+}
+
+int xen_set_long_mode(bool long_mode)
+{
+ int ret;
+ struct kvm_xen_hvm_attr xa = {
+ .type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+ .u.long_mode = long_mode,
+ };
+
+ if (!xen_overlay_singleton) {
+ return -ENOENT;
+ }
+
+ ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+ if (!ret) {
+ xen_overlay_singleton->long_mode = xa.u.long_mode;
+ }
+
+ return ret;
+}
+
+bool xen_is_long_mode(void)
+{
+ return xen_overlay_singleton && xen_overlay_singleton->long_mode;
+}
diff --git a/hw/i386/kvm/xen_overlay.h b/hw/i386/kvm/xen_overlay.h
new file mode 100644
index 0000000..75ecb6b
--- /dev/null
+++ b/hw/i386/kvm/xen_overlay.h
@@ -0,0 +1,26 @@
+/*
+ * QEMU Xen emulation: Shared/overlay pages support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_XEN_OVERLAY_H
+#define QEMU_XEN_OVERLAY_H
+
+void xen_overlay_create(void);
+
+int xen_overlay_map_shinfo_page(uint64_t gpa);
+void *xen_overlay_get_shinfo_ptr(void);
+
+int xen_sync_long_mode(void);
+int xen_set_long_mode(bool long_mode);
+bool xen_is_long_mode(void);
+
+void xen_overlay_do_map_page(MemoryRegion *page, uint64_t gpa);
+
+#endif /* QEMU_XEN_OVERLAY_H */
diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
new file mode 100644
index 0000000..14193ef
--- /dev/null
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -0,0 +1,500 @@
+/*
+ * QEMU Xen emulation: Xenstore emulation
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/main-loop.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "migration/vmstate.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "xen_overlay.h"
+#include "xen_evtchn.h"
+#include "xen_xenstore.h"
+
+#include "sysemu/kvm.h"
+#include "sysemu/kvm_xen.h"
+
+#include "hw/xen/interface/io/xs_wire.h"
+#include "hw/xen/interface/event_channel.h"
+
+#define TYPE_XEN_XENSTORE "xen-xenstore"
+OBJECT_DECLARE_SIMPLE_TYPE(XenXenstoreState, XEN_XENSTORE)
+
+#define XEN_PAGE_SHIFT 12
+#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
+
+#define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
+#define ENTRIES_PER_FRAME_V2 (XEN_PAGE_SIZE / sizeof(grant_entry_v2_t))
+
+#define XENSTORE_HEADER_SIZE ((unsigned int)sizeof(struct xsd_sockmsg))
+
+struct XenXenstoreState {
+ /*< private >*/
+ SysBusDevice busdev;
+ /*< public >*/
+
+ MemoryRegion xenstore_page;
+ struct xenstore_domain_interface *xs;
+ uint8_t req_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX];
+ uint8_t rsp_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX];
+ uint32_t req_offset;
+ uint32_t rsp_offset;
+ bool rsp_pending;
+ bool fatal_error;
+
+ evtchn_port_t guest_port;
+ evtchn_port_t be_port;
+ struct xenevtchn_handle *eh;
+};
+
+struct XenXenstoreState *xen_xenstore_singleton;
+
+static void xen_xenstore_event(void *opaque);
+
+static void xen_xenstore_realize(DeviceState *dev, Error **errp)
+{
+ XenXenstoreState *s = XEN_XENSTORE(dev);
+
+ if (xen_mode != XEN_EMULATE) {
+ error_setg(errp, "Xen xenstore support is for Xen emulation");
+ return;
+ }
+ memory_region_init_ram(&s->xenstore_page, OBJECT(dev), "xen:xenstore_page",
+ XEN_PAGE_SIZE, &error_abort);
+ memory_region_set_enabled(&s->xenstore_page, true);
+ s->xs = memory_region_get_ram_ptr(&s->xenstore_page);
+ memset(s->xs, 0, XEN_PAGE_SIZE);
+
+ /* We can't map it this early as KVM isn't ready */
+ xen_xenstore_singleton = s;
+
+ s->eh = xen_be_evtchn_open();
+ if (!s->eh) {
+ error_setg(errp, "Xenstore evtchn port init failed");
+ return;
+ }
+ aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), true,
+ xen_xenstore_event, NULL, NULL, NULL, s);
+}
+
+static bool xen_xenstore_is_needed(void *opaque)
+{
+ return xen_mode == XEN_EMULATE;
+}
+
+static int xen_xenstore_pre_save(void *opaque)
+{
+ XenXenstoreState *s = opaque;
+
+ if (s->eh) {
+ s->guest_port = xen_be_evtchn_get_guest_port(s->eh);
+ }
+ return 0;
+}
+
+static int xen_xenstore_post_load(void *opaque, int ver)
+{
+ XenXenstoreState *s = opaque;
+
+ /*
+ * As qemu/dom0, rebind to the guest's port. The Windows drivers may
+ * unbind the XenStore evtchn and rebind to it, having obtained the
+ * "remote" port through EVTCHNOP_status. In the case that migration
+ * occurs while it's unbound, the "remote" port needs to be the same
+ * as before so that the guest can find it, but should remain unbound.
+ */
+ if (s->guest_port) {
+ int be_port = xen_be_evtchn_bind_interdomain(s->eh, xen_domid,
+ s->guest_port);
+ if (be_port < 0) {
+ return be_port;
+ }
+ s->be_port = be_port;
+ }
+ return 0;
+}
+
+static const VMStateDescription xen_xenstore_vmstate = {
+ .name = "xen_xenstore",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = xen_xenstore_is_needed,
+ .pre_save = xen_xenstore_pre_save,
+ .post_load = xen_xenstore_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT8_ARRAY(req_data, XenXenstoreState,
+ sizeof_field(XenXenstoreState, req_data)),
+ VMSTATE_UINT8_ARRAY(rsp_data, XenXenstoreState,
+ sizeof_field(XenXenstoreState, rsp_data)),
+ VMSTATE_UINT32(req_offset, XenXenstoreState),
+ VMSTATE_UINT32(rsp_offset, XenXenstoreState),
+ VMSTATE_BOOL(rsp_pending, XenXenstoreState),
+ VMSTATE_UINT32(guest_port, XenXenstoreState),
+ VMSTATE_BOOL(fatal_error, XenXenstoreState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void xen_xenstore_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->realize = xen_xenstore_realize;
+ dc->vmsd = &xen_xenstore_vmstate;
+}
+
+static const TypeInfo xen_xenstore_info = {
+ .name = TYPE_XEN_XENSTORE,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(XenXenstoreState),
+ .class_init = xen_xenstore_class_init,
+};
+
+void xen_xenstore_create(void)
+{
+ DeviceState *dev = sysbus_create_simple(TYPE_XEN_XENSTORE, -1, NULL);
+
+ xen_xenstore_singleton = XEN_XENSTORE(dev);
+
+ /*
+ * Defer the init (xen_xenstore_reset()) until KVM is set up and the
+ * overlay page can be mapped.
+ */
+}
+
+static void xen_xenstore_register_types(void)
+{
+ type_register_static(&xen_xenstore_info);
+}
+
+type_init(xen_xenstore_register_types)
+
+uint16_t xen_xenstore_get_port(void)
+{
+ XenXenstoreState *s = xen_xenstore_singleton;
+ if (!s) {
+ return 0;
+ }
+ return s->guest_port;
+}
+
+static bool req_pending(XenXenstoreState *s)
+{
+ struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data;
+
+ return s->req_offset == XENSTORE_HEADER_SIZE + req->len;
+}
+
+static void reset_req(XenXenstoreState *s)
+{
+ memset(s->req_data, 0, sizeof(s->req_data));
+ s->req_offset = 0;
+}
+
+static void reset_rsp(XenXenstoreState *s)
+{
+ s->rsp_pending = false;
+
+ memset(s->rsp_data, 0, sizeof(s->rsp_data));
+ s->rsp_offset = 0;
+}
+
+static void process_req(XenXenstoreState *s)
+{
+ struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data;
+ struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+ const char enosys[] = "ENOSYS";
+
+ assert(req_pending(s));
+ assert(!s->rsp_pending);
+
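+ /*
+ * Only the ring transport is wired up at this point; every request is
+ * answered with an XS_ERROR/ENOSYS response.
+ */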
+ rsp->type = XS_ERROR;
+ rsp->req_id = req->req_id;
+ rsp->tx_id = req->tx_id;
+ rsp->len = sizeof(enosys);
+ memcpy((void *)&rsp[1], enosys, sizeof(enosys));
+
+ s->rsp_pending = true;
+ reset_req(s);
+}
+
+static unsigned int copy_from_ring(XenXenstoreState *s, uint8_t *ptr,
+ unsigned int len)
+{
+ if (!len) {
+ return 0;
+ }
+
+ XENSTORE_RING_IDX prod = qatomic_read(&s->xs->req_prod);
+ XENSTORE_RING_IDX cons = qatomic_read(&s->xs->req_cons);
+ unsigned int copied = 0;
+
+ /* Ensure the ring contents don't cross the req_prod access. */
+ smp_rmb();
+
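+ /*
+ * prod and cons are free-running indices; MASK_XENSTORE_IDX() wraps
+ * them into the ring and (prod - cons) is the number of bytes available.
+ */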
+ while (len) {
+ unsigned int avail = prod - cons;
+ unsigned int offset = MASK_XENSTORE_IDX(cons);
+ unsigned int copylen = avail;
+
+ if (avail > XENSTORE_RING_SIZE) {
+ error_report("XenStore ring handling error");
+ s->fatal_error = true;
+ break;
+ } else if (avail == 0) {
+ break;
+ }
+
+ if (copylen > len) {
+ copylen = len;
+ }
+ if (copylen > XENSTORE_RING_SIZE - offset) {
+ copylen = XENSTORE_RING_SIZE - offset;
+ }
+
+ memcpy(ptr, &s->xs->req[offset], copylen);
+ copied += copylen;
+
+ ptr += copylen;
+ len -= copylen;
+
+ cons += copylen;
+ }
+
+ /*
+ * Not sure this ever mattered except on Alpha, but this barrier
+ * is to ensure that the update to req_cons is globally visible
+ * only after we have consumed all the data from the ring, and we
+ * don't end up seeing data written to the ring *after* the other
+ * end sees the update and writes more to the ring. Xen's own
+ * xenstored has the same barrier here (although with no comment
+ * at all, obviously, because it's Xen code).
+ */
+ smp_mb();
+
+ qatomic_set(&s->xs->req_cons, cons);
+
+ return copied;
+}
+
+static unsigned int copy_to_ring(XenXenstoreState *s, uint8_t *ptr,
+ unsigned int len)
+{
+ if (!len) {
+ return 0;
+ }
+
+ XENSTORE_RING_IDX cons = qatomic_read(&s->xs->rsp_cons);
+ XENSTORE_RING_IDX prod = qatomic_read(&s->xs->rsp_prod);
+ unsigned int copied = 0;
+
+ /*
+ * This matches the barrier in copy_from_ring() (or the guest's
+ * equivalent) between reading the data from the ring and updating
+ * its cons pointer. It protects against the pathological case (which
+ * again I think never happened except on Alpha) where our
+ * subsequent writes to the ring could *cross* the read of
+ * rsp_cons and the guest could see the new data when it was
+ * intending to read the old.
+ */
+ smp_mb();
+
+ while (len) {
+ unsigned int avail = cons + XENSTORE_RING_SIZE - prod;
+ unsigned int offset = MASK_XENSTORE_IDX(prod);
+ unsigned int copylen = len;
+
+ if (avail > XENSTORE_RING_SIZE) {
+ error_report("XenStore ring handling error");
+ s->fatal_error = true;
+ break;
+ } else if (avail == 0) {
+ break;
+ }
+
+ if (copylen > avail) {
+ copylen = avail;
+ }
+ if (copylen > XENSTORE_RING_SIZE - offset) {
+ copylen = XENSTORE_RING_SIZE - offset;
+ }
+
+
+ memcpy(&s->xs->rsp[offset], ptr, copylen);
+ copied += copylen;
+
+ ptr += copylen;
+ len -= copylen;
+
+ prod += copylen;
+ }
+
+ /* Ensure the ring contents are seen before rsp_prod update. */
+ smp_wmb();
+
+ qatomic_set(&s->xs->rsp_prod, prod);
+
+ return copied;
+}
+
+static unsigned int get_req(XenXenstoreState *s)
+{
+ unsigned int copied = 0;
+
+ if (s->fatal_error) {
+ return 0;
+ }
+
+ assert(!req_pending(s));
+
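+ /* Read the fixed-size header first, then the payload once its length is known. */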
+ if (s->req_offset < XENSTORE_HEADER_SIZE) {
+ void *ptr = s->req_data + s->req_offset;
+ unsigned int len = XENSTORE_HEADER_SIZE;
+ unsigned int copylen = copy_from_ring(s, ptr, len);
+
+ copied += copylen;
+ s->req_offset += copylen;
+ }
+
+ if (s->req_offset >= XENSTORE_HEADER_SIZE) {
+ struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data;
+
+ if (req->len > (uint32_t)XENSTORE_PAYLOAD_MAX) {
+ error_report("Illegal XenStore request");
+ s->fatal_error = true;
+ return 0;
+ }
+
+ void *ptr = s->req_data + s->req_offset;
+ unsigned int len = XENSTORE_HEADER_SIZE + req->len - s->req_offset;
+ unsigned int copylen = copy_from_ring(s, ptr, len);
+
+ copied += copylen;
+ s->req_offset += copylen;
+ }
+
+ return copied;
+}
+
+static unsigned int put_rsp(XenXenstoreState *s)
+{
+ if (s->fatal_error) {
+ return 0;
+ }
+
+ assert(s->rsp_pending);
+
+ struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+ assert(s->rsp_offset < XENSTORE_HEADER_SIZE + rsp->len);
+
+ void *ptr = s->rsp_data + s->rsp_offset;
+ unsigned int len = XENSTORE_HEADER_SIZE + rsp->len - s->rsp_offset;
+ unsigned int copylen = copy_to_ring(s, ptr, len);
+
+ s->rsp_offset += copylen;
+
+ /* Have we produced a complete response? */
+ if (s->rsp_offset == XENSTORE_HEADER_SIZE + rsp->len) {
+ reset_rsp(s);
+ }
+
+ return copylen;
+}
+
+static void xen_xenstore_event(void *opaque)
+{
+ XenXenstoreState *s = opaque;
+ evtchn_port_t port = xen_be_evtchn_pending(s->eh);
+ unsigned int copied_to, copied_from;
+ bool processed, notify = false;
+
+ if (port != s->be_port) {
+ return;
+ }
+
+ /* We know this is a no-op. */
+ xen_be_evtchn_unmask(s->eh, port);
+
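+ /*
+ * Pump requests and responses until no further progress is made, then
+ * notify the guest if any data moved in either direction.
+ */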
+ do {
+ copied_to = copied_from = 0;
+ processed = false;
+
+ if (s->rsp_pending) {
+ copied_to = put_rsp(s);
+ }
+
+ if (!req_pending(s)) {
+ copied_from = get_req(s);
+ }
+
+ if (req_pending(s) && !s->rsp_pending) {
+ process_req(s);
+ processed = true;
+ }
+
+ notify |= copied_to || copied_from;
+ } while (copied_to || copied_from || processed);
+
+ if (notify) {
+ xen_be_evtchn_notify(s->eh, s->be_port);
+ }
+}
+
+static void alloc_guest_port(XenXenstoreState *s)
+{
+ struct evtchn_alloc_unbound alloc = {
+ .dom = DOMID_SELF,
+ .remote_dom = DOMID_QEMU,
+ };
+
+ if (!xen_evtchn_alloc_unbound_op(&alloc)) {
+ s->guest_port = alloc.port;
+ }
+}
+
+int xen_xenstore_reset(void)
+{
+ XenXenstoreState *s = xen_xenstore_singleton;
+ int err;
+
+ if (!s) {
+ return -ENOTSUP;
+ }
+
+ s->req_offset = s->rsp_offset = 0;
+ s->rsp_pending = false;
+
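+ /* Map the xenstore ring page at its reserved "special" guest frame. */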
+ if (!memory_region_is_mapped(&s->xenstore_page)) {
+ uint64_t gpa = XEN_SPECIAL_PFN(XENSTORE) << TARGET_PAGE_BITS;
+ xen_overlay_do_map_page(&s->xenstore_page, gpa);
+ }
+
+ alloc_guest_port(s);
+
+ /*
+ * As qemu/dom0, bind to the guest's port. For incoming migration, this
+ * will be unbound as the guest's evtchn table is overwritten. We then
+ * rebind to the correct guest port in xen_xenstore_post_load().
+ */
+ err = xen_be_evtchn_bind_interdomain(s->eh, xen_domid, s->guest_port);
+ if (err < 0) {
+ return err;
+ }
+ s->be_port = err;
+
+ return 0;
+}
diff --git a/hw/i386/kvm/xen_xenstore.h b/hw/i386/kvm/xen_xenstore.h
new file mode 100644
index 0000000..8c3768e
--- /dev/null
+++ b/hw/i386/kvm/xen_xenstore.h
@@ -0,0 +1,20 @@
+/*
+ * QEMU Xen emulation: Xenstore emulation
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_XEN_XENSTORE_H
+#define QEMU_XEN_XENSTORE_H
+
+void xen_xenstore_create(void);
+int xen_xenstore_reset(void);
+
+uint16_t xen_xenstore_get_port(void);
+
+#endif /* QEMU_XEN_XENSTORE_H */
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 992951c..fd17ce7 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -90,6 +90,10 @@
#include "hw/virtio/virtio-iommu.h"
#include "hw/virtio/virtio-pmem-pci.h"
#include "hw/virtio/virtio-mem-pci.h"
+#include "hw/i386/kvm/xen_overlay.h"
+#include "hw/i386/kvm/xen_evtchn.h"
+#include "hw/i386/kvm/xen_gnttab.h"
+#include "hw/i386/kvm/xen_xenstore.h"
#include "hw/mem/memory-device.h"
#include "sysemu/replay.h"
#include "target/i386/cpu.h"
@@ -1308,6 +1312,15 @@ void pc_basic_device_init(struct PCMachineState *pcms,
}
*rtc_state = ISA_DEVICE(mc146818_rtc_init(isa_bus, 2000, rtc_irq));
+#ifdef CONFIG_XEN_EMU
+ if (xen_mode == XEN_EMULATE) {
+ xen_evtchn_connect_gsis(gsi);
+ if (pcms->bus) {
+ pci_create_simple(pcms->bus, -1, "xen-platform");
+ }
+ }
+#endif
+
qemu_register_boot_set(pc_boot_set, *rtc_state);
if (!xen_enabled() &&
@@ -1846,6 +1859,19 @@ static void pc_machine_initfn(Object *obj)
cxl_machine_init(obj, &pcms->cxl_devices_state);
}
+int pc_machine_kvm_type(MachineState *machine, const char *kvm_type)
+{
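+ /* Instantiate the Xen emulation device models when emulating a Xen guest. */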
+#ifdef CONFIG_XEN_EMU
+ if (xen_mode == XEN_EMULATE) {
+ xen_overlay_create();
+ xen_evtchn_create();
+ xen_gnttab_create();
+ xen_xenstore_create();
+ }
+#endif
+ return 0;
+}
+
static void pc_machine_reset(MachineState *machine, ShutdownCause reason)
{
CPUState *cs;
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index c44846f..a56b10b 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -61,6 +61,11 @@
#include CONFIG_DEVICES
#include "kvm/kvm_i386.h"
+#ifdef CONFIG_XEN_EMU
+#include "hw/xen/xen.h"
+#include "hw/i386/kvm/xen_evtchn.h"
+#endif
+
/* Physical Address of PVH entry point read from kernel ELF NOTE */
static size_t pvh_start_addr;
@@ -610,6 +615,17 @@ void gsi_handler(void *opaque, int n, int level)
}
/* fall through */
case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1:
+#ifdef CONFIG_XEN_EMU
+ /*
+ * Xen delivers the GSI to the Legacy PIC (not that Legacy PIC
+ * routing actually works properly under Xen). And then to
+ * *either* the PIRQ handling or the I/OAPIC depending on
+ * whether the former wants it.
+ */
+ if (xen_mode == XEN_EMULATE && xen_evtchn_set_gsi(n, level)) {
+ break;
+ }
+#endif
qemu_set_irq(s->ioapic_irq[n], level);
break;
case IO_APIC_SECONDARY_IRQBASE
diff --git a/hw/i386/xen/meson.build b/hw/i386/xen/meson.build
index be84130..2e64a34 100644
--- a/hw/i386/xen/meson.build
+++ b/hw/i386/xen/meson.build
@@ -2,6 +2,9 @@ i386_ss.add(when: 'CONFIG_XEN', if_true: files(
'xen-hvm.c',
'xen-mapcache.c',
'xen_apic.c',
- 'xen_platform.c',
'xen_pvdevice.c',
))
+
+i386_ss.add(when: 'CONFIG_XEN_BUS', if_true: files(
+ 'xen_platform.c',
+))
diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index b9a6f7f..e5a1dd1 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -1502,13 +1502,7 @@ void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory)
device_listener_register(&state->device_listener);
xen_bus_init();
-
- /* Initialize backend core & drivers */
- if (xen_be_init() != 0) {
- error_report("xen backend core setup failed");
- goto err;
- }
- xen_be_register_common();
+ xen_be_init();
QLIST_INIT(&xen_physmap);
xen_read_physmap(state);
diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c
index 3795a20..539f7da 100644
--- a/hw/i386/xen/xen_platform.c
+++ b/hw/i386/xen/xen_platform.c
@@ -27,9 +27,9 @@
#include "qapi/error.h"
#include "hw/ide/pci.h"
#include "hw/pci/pci.h"
-#include "hw/xen/xen_common.h"
#include "migration/vmstate.h"
-#include "hw/xen/xen-legacy-backend.h"
+#include "hw/xen/xen.h"
+#include "net/net.h"
#include "trace.h"
#include "sysemu/xen.h"
#include "sysemu/block-backend.h"
@@ -37,6 +37,11 @@
#include "qemu/module.h"
#include "qom/object.h"
+#ifdef CONFIG_XEN
+#include "hw/xen/xen_common.h"
+#include "hw/xen/xen-legacy-backend.h"
+#endif
+
//#define DEBUG_PLATFORM
#ifdef DEBUG_PLATFORM
@@ -108,12 +113,25 @@ static void log_writeb(PCIXenPlatformState *s, char val)
#define _UNPLUG_NVME_DISKS 3
#define UNPLUG_NVME_DISKS (1u << _UNPLUG_NVME_DISKS)
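+
+/*
+ * Passthrough devices must never be unplugged; in Xen emulation mode,
+ * VFIO devices are treated as passthrough too.
+ */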
+static bool pci_device_is_passthrough(PCIDevice *d)
+{
+ if (!strcmp(d->name, "xen-pci-passthrough")) {
+ return true;
+ }
+
+ if (xen_mode == XEN_EMULATE && !strcmp(d->name, "vfio-pci")) {
+ return true;
+ }
+
+ return false;
+}
+
static void unplug_nic(PCIBus *b, PCIDevice *d, void *o)
{
/* We have to ignore passthrough devices */
if (pci_get_word(d->config + PCI_CLASS_DEVICE) ==
PCI_CLASS_NETWORK_ETHERNET
- && strcmp(d->name, "xen-pci-passthrough") != 0) {
+ && !pci_device_is_passthrough(d)) {
object_unparent(OBJECT(d));
}
}
@@ -186,9 +204,8 @@ static void unplug_disks(PCIBus *b, PCIDevice *d, void *opaque)
!(flags & UNPLUG_IDE_SCSI_DISKS);
/* We have to ignore passthrough devices */
- if (!strcmp(d->name, "xen-pci-passthrough")) {
+ if (pci_device_is_passthrough(d)) {
return;
}
switch (pci_get_word(d->config + PCI_CLASS_DEVICE)) {
case PCI_CLASS_STORAGE_IDE:
@@ -267,18 +284,26 @@ static void platform_fixed_ioport_writeb(void *opaque, uint32_t addr, uint32_t v
PCIXenPlatformState *s = opaque;
switch (addr) {
- case 0: /* Platform flags */ {
- hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ?
- HVMMEM_ram_ro : HVMMEM_ram_rw;
- if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) {
- DPRINTF("unable to change ro/rw state of ROM memory area!\n");
- } else {
+ case 0: /* Platform flags */
+ if (xen_mode == XEN_EMULATE) {
+ /* XX: Use i440fx/q35 PAM setup to do this? */
s->flags = val & PFFLAG_ROM_LOCK;
- DPRINTF("changed ro/rw state of ROM memory area. now is %s state.\n",
- (mem_type == HVMMEM_ram_ro ? "ro":"rw"));
+#ifdef CONFIG_XEN
+ } else {
+ hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ?
+ HVMMEM_ram_ro : HVMMEM_ram_rw;
+
+ if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) {
+ DPRINTF("unable to change ro/rw state of ROM memory area!\n");
+ } else {
+ s->flags = val & PFFLAG_ROM_LOCK;
+ DPRINTF("changed ro/rw state of ROM memory area. now is %s state.\n",
+ (mem_type == HVMMEM_ram_ro ? "ro" : "rw"));
+ }
+#endif
}
break;
- }
+
case 2:
log_writeb(s, val);
break;
@@ -496,8 +521,8 @@ static void xen_platform_realize(PCIDevice *dev, Error **errp)
uint8_t *pci_conf;
/* Device will crash on reset if xen is not initialized */
- if (!xen_enabled()) {
- error_setg(errp, "xen-platform device requires the Xen accelerator");
+ if (xen_mode == XEN_DISABLED) {
+ error_setg(errp, "xen-platform device requires a Xen guest");
return;
}
diff --git a/hw/pci/msi.c b/hw/pci/msi.c
index 1cadf15..041b0bd 100644
--- a/hw/pci/msi.c
+++ b/hw/pci/msi.c
@@ -24,6 +24,8 @@
#include "qemu/range.h"
#include "qapi/error.h"
+#include "hw/i386/kvm/xen_evtchn.h"
+
/* PCI_MSI_ADDRESS_LO */
#define PCI_MSI_ADDRESS_LO_MASK (~0x3)
@@ -414,6 +416,15 @@ void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len)
fprintf(stderr, "\n");
#endif
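+ /*
+ * Let the Xen event channel code snoop the MSI message of every vector
+ * so it can track MSIs that the guest has mapped to PIRQs.
+ */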
+ if (xen_mode == XEN_EMULATE) {
+ for (vector = 0; vector < msi_nr_vectors(flags); vector++) {
+ MSIMessage msg = msi_prepare_message(dev, vector);
+
+ xen_evtchn_snoop_msi(dev, false, vector, msg.address, msg.data,
+ msi_is_masked(dev, vector));
+ }
+ }
+
if (!(flags & PCI_MSI_FLAGS_ENABLE)) {
return;
}
diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index 9e70fcd..ab8869d 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -26,6 +26,8 @@
#include "qapi/error.h"
#include "trace.h"
+#include "hw/i386/kvm/xen_evtchn.h"
+
/* MSI enable bit and maskall bit are in byte 1 in FLAGS register */
#define MSIX_CONTROL_OFFSET (PCI_MSIX_FLAGS + 1)
#define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
@@ -124,6 +126,13 @@ static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked)
{
bool is_masked = msix_is_masked(dev, vector);
+ if (xen_mode == XEN_EMULATE) {
+ MSIMessage msg = msix_prepare_message(dev, vector);
+
+ xen_evtchn_snoop_msi(dev, true, vector, msg.address, msg.data,
+ is_masked);
+ }
+
if (is_masked == was_masked) {
return;
}
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index bad8e63..10c980b 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -49,6 +49,9 @@
#include "qemu/cutils.h"
#include "pci-internal.h"
+#include "hw/xen/xen.h"
+#include "hw/i386/kvm/xen_evtchn.h"
+
//#define DEBUG_PCI
#ifdef DEBUG_PCI
# define PCI_DPRINTF(format, ...) printf(format, ## __VA_ARGS__)
@@ -319,6 +322,17 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg)
{
MemTxAttrs attrs = {};
+ /*
+ * Xen uses the high bits of the address to contain some of the bits
+ * of the PIRQ#. Therefore we can't just send the write cycle and
+ * trust that it's caught by the APIC at 0xfee00000 because the
+ * target of the write might be e.g. 0x1000fee46000 for PIRQ#4166.
+ * So we intercept the delivery here instead of in kvm_send_msi().
+ */
+ if (xen_mode == XEN_EMULATE &&
+ xen_evtchn_deliver_pirq_msi(msg.address, msg.data)) {
+ return;
+ }
attrs.requester_id = pci_requester_id(dev);
address_space_stl_le(&dev->bus_master_as, msg.address, msg.data,
attrs, NULL);
@@ -988,6 +1002,9 @@ static void do_pci_unregister_device(PCIDevice *pci_dev)
pci_get_bus(pci_dev)->devices[pci_dev->devfn] = NULL;
pci_config_free(pci_dev);
+ if (xen_mode == XEN_EMULATE) {
+ xen_evtchn_remove_pci_device(pci_dev);
+ }
if (memory_region_is_mapped(&pci_dev->bus_master_enable_region)) {
memory_region_del_subregion(&pci_dev->bus_master_container_region,
&pci_dev->bus_master_enable_region);
diff --git a/hw/xen/Kconfig b/hw/xen/Kconfig
new file mode 100644
index 0000000..3467efb
--- /dev/null
+++ b/hw/xen/Kconfig
@@ -0,0 +1,3 @@
+config XEN_BUS
+ bool
+ default y if (XEN || XEN_EMU)
diff --git a/hw/xen/xen-legacy-backend.c b/hw/xen/xen-legacy-backend.c
index 085fd31..afba71f 100644
--- a/hw/xen/xen-legacy-backend.c
+++ b/hw/xen/xen-legacy-backend.c
@@ -676,21 +676,30 @@ void xenstore_update_fe(char *watch, struct XenLegacyDevice *xendev)
}
/* -------------------------------------------------------------------- */
-int xen_be_init(void)
+static void xen_set_dynamic_sysbus(void)
+{
+ Object *machine = qdev_get_machine();
+ ObjectClass *oc = object_get_class(machine);
+ MachineClass *mc = MACHINE_CLASS(oc);
+
+ machine_class_allow_dynamic_sysbus_dev(mc, TYPE_XENSYSDEV);
+}
+
+void xen_be_init(void)
{
xengnttab_handle *gnttabdev;
xenstore = xs_daemon_open();
if (!xenstore) {
xen_pv_printf(NULL, 0, "can't connect to xenstored\n");
- return -1;
+ exit(1);
}
qemu_set_fd_handler(xs_fileno(xenstore), xenstore_update, NULL, NULL);
if (xen_xc == NULL || xen_fmem == NULL) {
- /* Check if xen_init() have been called */
- goto err;
+ xen_pv_printf(NULL, 0, "Xen operations not set up\n");
+ exit(1);
}
gnttabdev = xengnttab_open(NULL, 0);
@@ -706,23 +715,16 @@ int xen_be_init(void)
xen_sysbus = qbus_new(TYPE_XENSYSBUS, xen_sysdev, "xen-sysbus");
qbus_set_bus_hotplug_handler(xen_sysbus);
- return 0;
-
-err:
- qemu_set_fd_handler(xs_fileno(xenstore), NULL, NULL, NULL);
- xs_daemon_close(xenstore);
- xenstore = NULL;
-
- return -1;
-}
-
-static void xen_set_dynamic_sysbus(void)
-{
- Object *machine = qdev_get_machine();
- ObjectClass *oc = object_get_class(machine);
- MachineClass *mc = MACHINE_CLASS(oc);
+ xen_set_dynamic_sysbus();
- machine_class_allow_dynamic_sysbus_dev(mc, TYPE_XENSYSDEV);
+ xen_be_register("console", &xen_console_ops);
+ xen_be_register("vkbd", &xen_kbdmouse_ops);
+#ifdef CONFIG_VIRTFS
+ xen_be_register("9pfs", &xen_9pfs_ops);
+#endif
+#ifdef CONFIG_USB_LIBUSB
+ xen_be_register("qusb", &xen_usb_ops);
+#endif
}
int xen_be_register(const char *type, struct XenDevOps *ops)
@@ -744,20 +746,6 @@ int xen_be_register(const char *type, struct XenDevOps *ops)
return xenstore_scan(type, xen_domid, ops);
}
-void xen_be_register_common(void)
-{
- xen_set_dynamic_sysbus();
-
- xen_be_register("console", &xen_console_ops);
- xen_be_register("vkbd", &xen_kbdmouse_ops);
-#ifdef CONFIG_VIRTFS
- xen_be_register("9pfs", &xen_9pfs_ops);
-#endif
-#ifdef CONFIG_USB_LIBUSB
- xen_be_register("qusb", &xen_usb_ops);
-#endif
-}
-
int xen_be_bind_evtchn(struct XenLegacyDevice *xendev)
{
if (xendev->local_port != -1) {
diff --git a/hw/xenpv/xen_machine_pv.c b/hw/xenpv/xen_machine_pv.c
index 20c9611..2e759d0 100644
--- a/hw/xenpv/xen_machine_pv.c
+++ b/hw/xenpv/xen_machine_pv.c
@@ -36,10 +36,7 @@ static void xen_init_pv(MachineState *machine)
int i;
/* Initialize backend core & drivers */
- if (xen_be_init() != 0) {
- error_report("%s: xen backend core setup failed", __func__);
- exit(1);
- }
+ xen_be_init();
switch (xen_mode) {
case XEN_ATTACH:
@@ -55,7 +52,6 @@ static void xen_init_pv(MachineState *machine)
break;
}
- xen_be_register_common();
xen_be_register("vfb", &xen_framebuffer_ops);
xen_be_register("qnic", &xen_netdev_ops);