author     Peter Maydell <peter.maydell@linaro.org>    2023-03-02 16:13:45 +0000
committer  Peter Maydell <peter.maydell@linaro.org>    2023-03-02 16:13:45 +0000
commit     c61d1a066cb6cf90662c82d0e35660fc0ccacbaf (patch)
tree       731e3a9a7319be047824ff8d87445ad5d343f950 /hw
parent     262312d7ba6e2966acedb4f9c134fd19176b4083 (diff)
parent     526947e496e4447d74b8d42415e2847481c5043d (diff)
Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging
* bugfixes
* show machine ACPI support in QAPI
* Core Xen emulation support for KVM/x86
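
Regarding the "show machine ACPI support in QAPI" item above: the series adds a boolean 'acpi' member to MachineInfo (see the hw/core/machine-qmp-cmds.c hunk below). A hedged sketch of how that surfaces in a QMP query-machines reply — the machine name and the other field values here are illustrative, not taken from this tree:

    -> { "execute": "query-machines" }
    <- { "return": [
           { "name": "pc-q35-8.0",
             "cpu-max": 1024,
             "hotpluggable-cpus": true,
             "numa-mem-supported": false,
             "deprecated": false,
             "acpi": true },
           ... ] }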
# -----BEGIN PGP SIGNATURE-----
#
# iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmQAlrYUHHBib256aW5p
# QHJlZGhhdC5jb20ACgkQv/vSX3jHroONWwf/fxDUMcZUvvatNxiVMhNfqEt/cL0F
# Durv1PmbbeVh9PP0W7XFkEXO3LCIRDyR4rtmCs7gHGdmzDOWQ+QIWgQijQ/y7ElQ
# bTVsvs0+s/6H3csP3dJTJaXSHshbQvrAZTsyk5KcAB6xdL1KqulfLUoGvXJhAmRs
# NKZN8un+nuAhFhL0VBWA9eQaP+BVHQI5ItAj8PaoBby4+Q9fNnat6j1/G4iLly8J
# dxIwCnuRHLiB3melWtadwbv6ddLJFeZNa50HUIsynqoItTzmRVr+oXz1yfq087dB
# 9uksmoqb+icGEdwqs0iYbQ/dhVnIrMDpn/n2Us28S5VdIMVvxr1JEbEkSQ==
# =0jY8
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 02 Mar 2023 12:29:42 GMT
# gpg: using RSA key F13338574B662389866C7682BFFBD25F78C7AE83
# gpg: issuer "pbonzini@redhat.com"
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [full]
# gpg: aka "Paolo Bonzini <pbonzini@redhat.com>" [full]
# Primary key fingerprint: 46F5 9FBD 57D6 12E7 BFD4 E2F7 7E15 100C CD36 69B1
# Subkey fingerprint: F133 3857 4B66 2389 866C 7682 BFFB D25F 78C7 AE83
* tag 'for-upstream' of https://gitlab.com/bonzini/qemu: (62 commits)
Makefile: qemu-bundle is a directory
qapi: Add 'acpi' field to 'query-machines' output
hw/xen: Subsume xen_be_register_common() into xen_be_init()
i386/xen: Document Xen HVM emulation
kvm/i386: Add xen-evtchn-max-pirq property
hw/xen: Support MSI mapping to PIRQ
hw/xen: Support GSI mapping to PIRQ
hw/xen: Implement emulated PIRQ hypercall support
i386/xen: Implement HYPERVISOR_physdev_op
hw/xen: Automatically add xen-platform PCI device for emulated Xen guests
hw/xen: Add basic ring handling to xenstore
hw/xen: Add xen_xenstore device for xenstore emulation
hw/xen: Add backend implementation of interdomain event channel support
i386/xen: handle HVMOP_get_param
i386/xen: Reserve Xen special pages for console, xenstore rings
i386/xen: handle PV timer hypercalls
hw/xen: Implement GNTTABOP_query_size
i386/xen: Implement HYPERVISOR_grant_table_op and GNTTABOP_[gs]et_version
hw/xen: Support mapping grant frames
hw/xen: Add xen_gnttab device for grant table emulation
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
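
For context on the "Core Xen emulation support for KVM/x86" items, the series also documents how the emulation is enabled ("i386/xen: Document Xen HVM emulation"). A hedged command-line sketch, assuming the KVM accelerator properties are named as in that documentation (xen-version, xen-evtchn-max-pirq, with a split irqchip required); the guest image path and machine options are placeholders:

    qemu-system-x86_64 -M pc -cpu host \
        -accel kvm,xen-version=0x4000a,kernel-irqchip=split,xen-evtchn-max-pirq=256 \
        -display none -serial mon:stdio guest.img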
Diffstat (limited to 'hw')
-rw-r--r--  hw/Kconfig                    |    1
-rw-r--r--  hw/core/machine-qmp-cmds.c    |    1
-rw-r--r--  hw/i386/Kconfig               |    5
-rw-r--r--  hw/i386/kvm/meson.build       |   13
-rw-r--r--  hw/i386/kvm/trace-events      |    5
-rw-r--r--  hw/i386/kvm/trace.h           |    1
-rw-r--r--  hw/i386/kvm/xen-stubs.c       |   44
-rw-r--r--  hw/i386/kvm/xen_evtchn.c      | 2341
-rw-r--r--  hw/i386/kvm/xen_evtchn.h      |   88
-rw-r--r--  hw/i386/kvm/xen_gnttab.c      |  232
-rw-r--r--  hw/i386/kvm/xen_gnttab.h      |   25
-rw-r--r--  hw/i386/kvm/xen_overlay.c     |  272
-rw-r--r--  hw/i386/kvm/xen_overlay.h     |   26
-rw-r--r--  hw/i386/kvm/xen_xenstore.c    |  500
-rw-r--r--  hw/i386/kvm/xen_xenstore.h    |   20
-rw-r--r--  hw/i386/pc.c                  |   26
-rw-r--r--  hw/i386/x86.c                 |   16
-rw-r--r--  hw/i386/xen/meson.build       |    5
-rw-r--r--  hw/i386/xen/xen-hvm.c         |    8
-rw-r--r--  hw/i386/xen/xen_platform.c    |   57
-rw-r--r--  hw/pci/msi.c                  |   11
-rw-r--r--  hw/pci/msix.c                 |    9
-rw-r--r--  hw/pci/pci.c                  |   17
-rw-r--r--  hw/xen/Kconfig                |    3
-rw-r--r--  hw/xen/xen-legacy-backend.c   |   56
-rw-r--r--  hw/xenpv/xen_machine_pv.c     |    6
26 files changed, 3725 insertions, 63 deletions
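
The xen_evtchn.c code below also wires up QMP commands xen-event-list and xen-event-inject plus HMP wrappers (qmp_xen_event_list(), hmp_xen_event_inject(), etc.). A hedged monitor session sketch, assuming the HMP commands are registered under those names; the port numbers and bindings shown are made up:

    (qemu) xen-event-list
    port    1: vcpu: 0 virq(0)
    port    2: vcpu: 0 interdomain(qemu:1)
    port    3: vcpu: 1 ipi
    (qemu) xen-event-inject 3
    Delivered port 3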
@@ -41,6 +41,7 @@ source tpm/Kconfig
 source usb/Kconfig
 source virtio/Kconfig
 source vfio/Kconfig
+source xen/Kconfig
 source watchdog/Kconfig
 
 # arch Kconfig
diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c
index 2d90474..b98ff15 100644
--- a/hw/core/machine-qmp-cmds.c
+++ b/hw/core/machine-qmp-cmds.c
@@ -102,6 +102,7 @@ MachineInfoList *qmp_query_machines(Error **errp)
         info->hotpluggable_cpus = mc->has_hotpluggable_cpus;
         info->numa_mem_supported = mc->numa_mem_supported;
         info->deprecated = !!mc->deprecation_reason;
+        info->acpi = !!object_class_property_find(OBJECT_CLASS(mc), "acpi");
         if (mc->default_cpu_type) {
             info->default_cpu_type = g_strdup(mc->default_cpu_type);
         }
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index 9fbfe74..d40802d 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -136,3 +136,8 @@ config VMPORT
 config VMMOUSE
     bool
     depends on VMPORT
+
+config XEN_EMU
+    bool
+    default y
+    depends on KVM && (I386 || X86_64)
diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 95467f1..82dd6ae 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -4,5 +4,18 @@ i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c'))
 i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c'))
+i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
+  'xen_overlay.c',
+  'xen_evtchn.c',
+  'xen_gnttab.c',
+  'xen_xenstore.c',
+  ))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
+
+xen_stubs_ss = ss.source_set()
+xen_stubs_ss.add(when: 'CONFIG_XEN_EMU', if_false: files(
+  'xen-stubs.c',
+))
+
+specific_ss.add_all(when: 'CONFIG_SOFTMMU', if_true: xen_stubs_ss)
diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events
new file mode 100644
index 0000000..b83c3eb
--- /dev/null
+++ b/hw/i386/kvm/trace-events
@@ -0,0 +1,5 @@
+kvm_xen_map_pirq(int pirq, int gsi) "pirq %d gsi %d"
+kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d"
+kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d"
+kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d"
+kvm_xen_unmask_pirq(int pirq, char *dev, int vector) "pirq %d dev %s vector %d"
diff --git a/hw/i386/kvm/trace.h b/hw/i386/kvm/trace.h
new file mode 100644
index 0000000..e55d081
--- /dev/null
+++ b/hw/i386/kvm/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_i386_kvm.h"
diff --git a/hw/i386/kvm/xen-stubs.c b/hw/i386/kvm/xen-stubs.c
new file mode 100644
index 0000000..ae406e0
--- /dev/null
+++ b/hw/i386/kvm/xen-stubs.c
@@ -0,0 +1,44 @@
+/*
+ * QEMU Xen emulation: QMP stubs
+ *
+ * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */ + +#include "qemu/osdep.h" + +#include "qapi/error.h" +#include "qapi/qapi-commands-misc-target.h" + +#include "xen_evtchn.h" + +void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector, + uint64_t addr, uint32_t data, bool is_masked) +{ +} + +void xen_evtchn_remove_pci_device(PCIDevice *dev) +{ +} + +bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data) +{ + return false; +} + +#ifdef TARGET_I386 +EvtchnInfoList *qmp_xen_event_list(Error **errp) +{ + error_setg(errp, "Xen event channel emulation not enabled"); + return NULL; +} + +void qmp_xen_event_inject(uint32_t port, Error **errp) +{ + error_setg(errp, "Xen event channel emulation not enabled"); +} +#endif diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c new file mode 100644 index 0000000..886fbf6 --- /dev/null +++ b/hw/i386/kvm/xen_evtchn.c @@ -0,0 +1,2341 @@ +/* + * QEMU Xen emulation: Event channel support + * + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include "qemu/module.h" +#include "qemu/lockable.h" +#include "qemu/main-loop.h" +#include "qemu/log.h" +#include "monitor/monitor.h" +#include "monitor/hmp.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-misc-target.h" +#include "qapi/qmp/qdict.h" +#include "qom/object.h" +#include "exec/target_page.h" +#include "exec/address-spaces.h" +#include "migration/vmstate.h" +#include "trace.h" + +#include "hw/sysbus.h" +#include "hw/xen/xen.h" +#include "hw/i386/x86.h" +#include "hw/i386/pc.h" +#include "hw/pci/pci.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" +#include "hw/irq.h" + +#include "xen_evtchn.h" +#include "xen_overlay.h" +#include "xen_xenstore.h" + +#include "sysemu/kvm.h" +#include "sysemu/kvm_xen.h" +#include <linux/kvm.h> +#include <sys/eventfd.h> + +#include "hw/xen/interface/memory.h" +#include "hw/xen/interface/hvm/params.h" + +/* XX: For kvm_update_msi_routes_all() */ +#include "target/i386/kvm/kvm_i386.h" + +#define TYPE_XEN_EVTCHN "xen-evtchn" +OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN) + +typedef struct XenEvtchnPort { + uint32_t vcpu; /* Xen/ACPI vcpu_id */ + uint16_t type; /* EVTCHNSTAT_xxxx */ + uint16_t type_val; /* pirq# / virq# / remote port according to type */ +} XenEvtchnPort; + +/* 32-bit compatibility definitions, also used natively in 32-bit build */ +struct compat_arch_vcpu_info { + unsigned int cr2; + unsigned int pad[5]; +}; + +struct compat_vcpu_info { + uint8_t evtchn_upcall_pending; + uint8_t evtchn_upcall_mask; + uint16_t pad; + uint32_t evtchn_pending_sel; + struct compat_arch_vcpu_info arch; + struct vcpu_time_info time; +}; /* 64 bytes (x86) */ + +struct compat_arch_shared_info { + unsigned int max_pfn; + unsigned int pfn_to_mfn_frame_list_list; + unsigned int nmi_reason; + unsigned int p2m_cr3; + unsigned int p2m_vaddr; + unsigned int p2m_generation; + uint32_t wc_sec_hi; +}; + +struct compat_shared_info { + struct compat_vcpu_info vcpu_info[XEN_LEGACY_MAX_VCPUS]; + uint32_t evtchn_pending[32]; + uint32_t evtchn_mask[32]; + uint32_t wc_version; /* Version counter: see vcpu_time_info_t. 
*/ + uint32_t wc_sec; + uint32_t wc_nsec; + struct compat_arch_shared_info arch; +}; + +#define COMPAT_EVTCHN_2L_NR_CHANNELS 1024 + +/* Local private implementation of struct xenevtchn_handle */ +struct xenevtchn_handle { + evtchn_port_t be_port; + evtchn_port_t guest_port; /* Or zero for unbound */ + int fd; +}; + +/* + * For unbound/interdomain ports there are only two possible remote + * domains; self and QEMU. Use a single high bit in type_val for that, + * and the low bits for the remote port number (or 0 for unbound). + */ +#define PORT_INFO_TYPEVAL_REMOTE_QEMU 0x8000 +#define PORT_INFO_TYPEVAL_REMOTE_PORT_MASK 0x7FFF + +/* + * These 'emuirq' values are used by Xen in the LM stream... and yes, I am + * insane enough to think about guest-transparent live migration from actual + * Xen to QEMU, and ensuring that we can convert/consume the stream. + */ +#define IRQ_UNBOUND -1 +#define IRQ_PT -2 +#define IRQ_MSI_EMU -3 + + +struct pirq_info { + int gsi; + uint16_t port; + PCIDevice *dev; + int vector; + bool is_msix; + bool is_masked; + bool is_translated; +}; + +struct XenEvtchnState { + /*< private >*/ + SysBusDevice busdev; + /*< public >*/ + + uint64_t callback_param; + bool evtchn_in_kernel; + uint32_t callback_gsi; + + QEMUBH *gsi_bh; + + QemuMutex port_lock; + uint32_t nr_ports; + XenEvtchnPort port_table[EVTCHN_2L_NR_CHANNELS]; + qemu_irq gsis[IOAPIC_NUM_PINS]; + + struct xenevtchn_handle *be_handles[EVTCHN_2L_NR_CHANNELS]; + + uint32_t nr_pirqs; + + /* Bitmap of allocated PIRQs (serialized) */ + uint16_t nr_pirq_inuse_words; + uint64_t *pirq_inuse_bitmap; + + /* GSI → PIRQ mapping (serialized) */ + uint16_t gsi_pirq[IOAPIC_NUM_PINS]; + + /* Per-GSI assertion state (serialized) */ + uint32_t pirq_gsi_set; + + /* Per-PIRQ information (rebuilt on migration, protected by BQL) */ + struct pirq_info *pirq; +}; + +#define pirq_inuse_word(s, pirq) (s->pirq_inuse_bitmap[((pirq) / 64)]) +#define pirq_inuse_bit(pirq) (1ULL << ((pirq) & 63)) + +#define pirq_inuse(s, pirq) (pirq_inuse_word(s, pirq) & pirq_inuse_bit(pirq)) + +struct XenEvtchnState *xen_evtchn_singleton; + +/* Top bits of callback_param are the type (HVM_PARAM_CALLBACK_TYPE_xxx) */ +#define CALLBACK_VIA_TYPE_SHIFT 56 + +static void unbind_backend_ports(XenEvtchnState *s); + +static int xen_evtchn_pre_load(void *opaque) +{ + XenEvtchnState *s = opaque; + + /* Unbind all the backend-side ports; they need to rebind */ + unbind_backend_ports(s); + + /* It'll be leaked otherwise. */ + g_free(s->pirq_inuse_bitmap); + s->pirq_inuse_bitmap = NULL; + + return 0; +} + +static int xen_evtchn_post_load(void *opaque, int version_id) +{ + XenEvtchnState *s = opaque; + uint32_t i; + + if (s->callback_param) { + xen_evtchn_set_callback_param(s->callback_param); + } + + /* Rebuild s->pirq[].port mapping */ + for (i = 0; i < s->nr_ports; i++) { + XenEvtchnPort *p = &s->port_table[i]; + + if (p->type == EVTCHNSTAT_pirq) { + assert(p->type_val); + assert(p->type_val < s->nr_pirqs); + + /* + * Set the gsi to IRQ_UNBOUND; it may be changed to an actual + * GSI# below, or to IRQ_MSI_EMU when the MSI table snooping + * catches up with it. 
+ */ + s->pirq[p->type_val].gsi = IRQ_UNBOUND; + s->pirq[p->type_val].port = i; + } + } + /* Rebuild s->pirq[].gsi mapping */ + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + if (s->gsi_pirq[i]) { + s->pirq[s->gsi_pirq[i]].gsi = i; + } + } + return 0; +} + +static bool xen_evtchn_is_needed(void *opaque) +{ + return xen_mode == XEN_EMULATE; +} + +static const VMStateDescription xen_evtchn_port_vmstate = { + .name = "xen_evtchn_port", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(vcpu, XenEvtchnPort), + VMSTATE_UINT16(type, XenEvtchnPort), + VMSTATE_UINT16(type_val, XenEvtchnPort), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription xen_evtchn_vmstate = { + .name = "xen_evtchn", + .version_id = 1, + .minimum_version_id = 1, + .needed = xen_evtchn_is_needed, + .pre_load = xen_evtchn_pre_load, + .post_load = xen_evtchn_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT64(callback_param, XenEvtchnState), + VMSTATE_UINT32(nr_ports, XenEvtchnState), + VMSTATE_STRUCT_VARRAY_UINT32(port_table, XenEvtchnState, nr_ports, 1, + xen_evtchn_port_vmstate, XenEvtchnPort), + VMSTATE_UINT16_ARRAY(gsi_pirq, XenEvtchnState, IOAPIC_NUM_PINS), + VMSTATE_VARRAY_UINT16_ALLOC(pirq_inuse_bitmap, XenEvtchnState, + nr_pirq_inuse_words, 0, + vmstate_info_uint64, uint64_t), + VMSTATE_UINT32(pirq_gsi_set, XenEvtchnState), + VMSTATE_END_OF_LIST() + } +}; + +static void xen_evtchn_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->vmsd = &xen_evtchn_vmstate; +} + +static const TypeInfo xen_evtchn_info = { + .name = TYPE_XEN_EVTCHN, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(XenEvtchnState), + .class_init = xen_evtchn_class_init, +}; + +static void gsi_assert_bh(void *opaque) +{ + struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0); + if (vi) { + xen_evtchn_set_callback_level(!!vi->evtchn_upcall_pending); + } +} + +void xen_evtchn_create(void) +{ + XenEvtchnState *s = XEN_EVTCHN(sysbus_create_simple(TYPE_XEN_EVTCHN, + -1, NULL)); + int i; + + xen_evtchn_singleton = s; + + qemu_mutex_init(&s->port_lock); + s->gsi_bh = aio_bh_new(qemu_get_aio_context(), gsi_assert_bh, s); + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + sysbus_init_irq(SYS_BUS_DEVICE(s), &s->gsis[i]); + } + + /* + * The Xen scheme for encoding PIRQ# into an MSI message is not + * compatible with 32-bit MSI, as it puts the high bits of the + * PIRQ# into the high bits of the MSI message address, instead of + * using the Extended Destination ID in address bits 4-11 which + * perhaps would have been a better choice. + * + * To keep life simple, kvm_accel_instance_init() initialises the + * default to 256. which conveniently doesn't need to set anything + * outside the low 32 bits of the address. It can be increased by + * setting the xen-evtchn-max-pirq property. 
+ */ + s->nr_pirqs = kvm_xen_get_evtchn_max_pirq(); + + s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64); + s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words); + s->pirq = g_new0(struct pirq_info, s->nr_pirqs); +} + +void xen_evtchn_connect_gsis(qemu_irq *system_gsis) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int i; + + if (!s) { + return; + } + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + sysbus_connect_irq(SYS_BUS_DEVICE(s), i, system_gsis[i]); + } +} + +static void xen_evtchn_register_types(void) +{ + type_register_static(&xen_evtchn_info); +} + +type_init(xen_evtchn_register_types) + +static int set_callback_pci_intx(XenEvtchnState *s, uint64_t param) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + uint8_t pin = param & 3; + uint8_t devfn = (param >> 8) & 0xff; + uint16_t bus = (param >> 16) & 0xffff; + uint16_t domain = (param >> 32) & 0xffff; + PCIDevice *pdev; + PCIINTxRoute r; + + if (domain || !pcms) { + return 0; + } + + pdev = pci_find_device(pcms->bus, bus, devfn); + if (!pdev) { + return 0; + } + + r = pci_device_route_intx_to_irq(pdev, pin); + if (r.mode != PCI_INTX_ENABLED) { + return 0; + } + + /* + * Hm, can we be notified of INTX routing changes? Not without + * *owning* the device and being allowed to overwrite its own + * ->intx_routing_notifier, AFAICT. So let's not. + */ + return r.irq; +} + +void xen_evtchn_set_callback_level(int level) +{ + XenEvtchnState *s = xen_evtchn_singleton; + if (!s) { + return; + } + + /* + * We get to this function in a number of ways: + * + * • From I/O context, via PV backend drivers sending a notification to + * the guest. + * + * • From guest vCPU context, via loopback interdomain event channels + * (or theoretically even IPIs but guests don't use those with GSI + * delivery because that's pointless. We don't want a malicious guest + * to be able to trigger a deadlock though, so we can't rule it out.) + * + * • From guest vCPU context when the HVM_PARAM_CALLBACK_IRQ is being + * configured. + * + * • From guest vCPU context in the KVM exit handler, if the upcall + * pending flag has been cleared and the GSI needs to be deasserted. + * + * • Maybe in future, in an interrupt ack/eoi notifier when the GSI has + * been acked in the irqchip. + * + * Whichever context we come from if we aren't already holding the BQL + * then e can't take it now, as we may already hold s->port_lock. So + * trigger the BH to set the IRQ for us instead of doing it immediately. + * + * In the HVM_PARAM_CALLBACK_IRQ and KVM exit handler cases, the caller + * will deliberately take the BQL because they want the change to take + * effect immediately. That just leaves interdomain loopback as the case + * which uses the BH. + */ + if (!qemu_mutex_iothread_locked()) { + qemu_bh_schedule(s->gsi_bh); + return; + } + + if (s->callback_gsi && s->callback_gsi < IOAPIC_NUM_PINS) { + qemu_set_irq(s->gsis[s->callback_gsi], level); + if (level) { + /* Ensure the vCPU polls for deassertion */ + kvm_xen_set_callback_asserted(); + } + } +} + +int xen_evtchn_set_callback_param(uint64_t param) +{ + XenEvtchnState *s = xen_evtchn_singleton; + struct kvm_xen_hvm_attr xa = { + .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR, + .u.vector = 0, + }; + bool in_kernel = false; + uint32_t gsi = 0; + int type = param >> CALLBACK_VIA_TYPE_SHIFT; + int ret; + + if (!s) { + return -ENOTSUP; + } + + /* + * We need the BQL because set_callback_pci_intx() may call into PCI code, + * and because we may need to manipulate the old and new GSI levels. 
+ */ + assert(qemu_mutex_iothread_locked()); + qemu_mutex_lock(&s->port_lock); + + switch (type) { + case HVM_PARAM_CALLBACK_TYPE_VECTOR: { + xa.u.vector = (uint8_t)param, + + ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa); + if (!ret && kvm_xen_has_cap(EVTCHN_SEND)) { + in_kernel = true; + } + gsi = 0; + break; + } + + case HVM_PARAM_CALLBACK_TYPE_PCI_INTX: + gsi = set_callback_pci_intx(s, param); + ret = gsi ? 0 : -EINVAL; + break; + + case HVM_PARAM_CALLBACK_TYPE_GSI: + gsi = (uint32_t)param; + ret = 0; + break; + + default: + /* Xen doesn't return error even if you set something bogus */ + ret = 0; + break; + } + + if (!ret) { + /* If vector delivery was turned *off* then tell the kernel */ + if ((s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) == + HVM_PARAM_CALLBACK_TYPE_VECTOR && !xa.u.vector) { + kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa); + } + s->callback_param = param; + s->evtchn_in_kernel = in_kernel; + + if (gsi != s->callback_gsi) { + struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0); + + xen_evtchn_set_callback_level(0); + s->callback_gsi = gsi; + + if (gsi && vi && vi->evtchn_upcall_pending) { + kvm_xen_inject_vcpu_callback_vector(0, type); + } + } + } + + qemu_mutex_unlock(&s->port_lock); + + return ret; +} + +static void inject_callback(XenEvtchnState *s, uint32_t vcpu) +{ + int type = s->callback_param >> CALLBACK_VIA_TYPE_SHIFT; + + kvm_xen_inject_vcpu_callback_vector(vcpu, type); +} + +static void deassign_kernel_port(evtchn_port_t port) +{ + struct kvm_xen_hvm_attr ha; + int ret; + + ha.type = KVM_XEN_ATTR_TYPE_EVTCHN; + ha.u.evtchn.send_port = port; + ha.u.evtchn.flags = KVM_XEN_EVTCHN_DEASSIGN; + + ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &ha); + if (ret) { + qemu_log_mask(LOG_GUEST_ERROR, "Failed to unbind kernel port %d: %s\n", + port, strerror(ret)); + } +} + +static int assign_kernel_port(uint16_t type, evtchn_port_t port, + uint32_t vcpu_id) +{ + CPUState *cpu = qemu_get_cpu(vcpu_id); + struct kvm_xen_hvm_attr ha; + + if (!cpu) { + return -ENOENT; + } + + ha.type = KVM_XEN_ATTR_TYPE_EVTCHN; + ha.u.evtchn.send_port = port; + ha.u.evtchn.type = type; + ha.u.evtchn.flags = 0; + ha.u.evtchn.deliver.port.port = port; + ha.u.evtchn.deliver.port.vcpu = kvm_arch_vcpu_id(cpu); + ha.u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &ha); +} + +static int assign_kernel_eventfd(uint16_t type, evtchn_port_t port, int fd) +{ + struct kvm_xen_hvm_attr ha; + + ha.type = KVM_XEN_ATTR_TYPE_EVTCHN; + ha.u.evtchn.send_port = port; + ha.u.evtchn.type = type; + ha.u.evtchn.flags = 0; + ha.u.evtchn.deliver.eventfd.port = 0; + ha.u.evtchn.deliver.eventfd.fd = fd; + + return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &ha); +} + +static bool valid_port(evtchn_port_t port) +{ + if (!port) { + return false; + } + + if (xen_is_long_mode()) { + return port < EVTCHN_2L_NR_CHANNELS; + } else { + return port < COMPAT_EVTCHN_2L_NR_CHANNELS; + } +} + +static bool valid_vcpu(uint32_t vcpu) +{ + return !!qemu_get_cpu(vcpu); +} + +static void unbind_backend_ports(XenEvtchnState *s) +{ + XenEvtchnPort *p; + int i; + + for (i = 1; i < s->nr_ports; i++) { + p = &s->port_table[i]; + if (p->type == EVTCHNSTAT_interdomain && + (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU)) { + evtchn_port_t be_port = p->type_val & PORT_INFO_TYPEVAL_REMOTE_PORT_MASK; + + if (s->be_handles[be_port]) { + /* This part will be overwritten on the load anyway. 
*/ + p->type = EVTCHNSTAT_unbound; + p->type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU; + + /* Leave the backend port open and unbound too. */ + if (kvm_xen_has_cap(EVTCHN_SEND)) { + deassign_kernel_port(i); + } + s->be_handles[be_port]->guest_port = 0; + } + } + } +} + +int xen_evtchn_status_op(struct evtchn_status *status) +{ + XenEvtchnState *s = xen_evtchn_singleton; + XenEvtchnPort *p; + + if (!s) { + return -ENOTSUP; + } + + if (status->dom != DOMID_SELF && status->dom != xen_domid) { + return -ESRCH; + } + + if (!valid_port(status->port)) { + return -EINVAL; + } + + qemu_mutex_lock(&s->port_lock); + + p = &s->port_table[status->port]; + + status->status = p->type; + status->vcpu = p->vcpu; + + switch (p->type) { + case EVTCHNSTAT_unbound: + if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) { + status->u.unbound.dom = DOMID_QEMU; + } else { + status->u.unbound.dom = xen_domid; + } + break; + + case EVTCHNSTAT_interdomain: + if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) { + status->u.interdomain.dom = DOMID_QEMU; + } else { + status->u.interdomain.dom = xen_domid; + } + + status->u.interdomain.port = p->type_val & + PORT_INFO_TYPEVAL_REMOTE_PORT_MASK; + break; + + case EVTCHNSTAT_pirq: + status->u.pirq = p->type_val; + break; + + case EVTCHNSTAT_virq: + status->u.virq = p->type_val; + break; + } + + qemu_mutex_unlock(&s->port_lock); + return 0; +} + +/* + * Never thought I'd hear myself say this, but C++ templates would be + * kind of nice here. + * + * template<class T> static int do_unmask_port(T *shinfo, ...); + */ +static int do_unmask_port_lm(XenEvtchnState *s, evtchn_port_t port, + bool do_unmask, struct shared_info *shinfo, + struct vcpu_info *vcpu_info) +{ + const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]); + typeof(shinfo->evtchn_pending[0]) mask; + int idx = port / bits_per_word; + int offset = port % bits_per_word; + + mask = 1UL << offset; + + if (idx >= bits_per_word) { + return -EINVAL; + } + + if (do_unmask) { + /* + * If this is a true unmask operation, clear the mask bit. If + * it was already unmasked, we have nothing further to do. + */ + if (!((qatomic_fetch_and(&shinfo->evtchn_mask[idx], ~mask) & mask))) { + return 0; + } + } else { + /* + * This is a pseudo-unmask for affinity changes. We don't + * change the mask bit, and if it's *masked* we have nothing + * else to do. + */ + if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) { + return 0; + } + } + + /* If the event was not pending, we're done. */ + if (!(qatomic_fetch_or(&shinfo->evtchn_pending[idx], 0) & mask)) { + return 0; + } + + /* Now on to the vcpu_info evtchn_pending_sel index... */ + mask = 1UL << idx; + + /* If a port in this word was already pending for this vCPU, all done. 
*/ + if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) { + return 0; + } + + /* Set evtchn_upcall_pending for this vCPU */ + if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) { + return 0; + } + + inject_callback(s, s->port_table[port].vcpu); + + return 0; +} + +static int do_unmask_port_compat(XenEvtchnState *s, evtchn_port_t port, + bool do_unmask, + struct compat_shared_info *shinfo, + struct compat_vcpu_info *vcpu_info) +{ + const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]); + typeof(shinfo->evtchn_pending[0]) mask; + int idx = port / bits_per_word; + int offset = port % bits_per_word; + + mask = 1UL << offset; + + if (idx >= bits_per_word) { + return -EINVAL; + } + + if (do_unmask) { + /* + * If this is a true unmask operation, clear the mask bit. If + * it was already unmasked, we have nothing further to do. + */ + if (!((qatomic_fetch_and(&shinfo->evtchn_mask[idx], ~mask) & mask))) { + return 0; + } + } else { + /* + * This is a pseudo-unmask for affinity changes. We don't + * change the mask bit, and if it's *masked* we have nothing + * else to do. + */ + if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) { + return 0; + } + } + + /* If the event was not pending, we're done. */ + if (!(qatomic_fetch_or(&shinfo->evtchn_pending[idx], 0) & mask)) { + return 0; + } + + /* Now on to the vcpu_info evtchn_pending_sel index... */ + mask = 1UL << idx; + + /* If a port in this word was already pending for this vCPU, all done. */ + if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) { + return 0; + } + + /* Set evtchn_upcall_pending for this vCPU */ + if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) { + return 0; + } + + inject_callback(s, s->port_table[port].vcpu); + + return 0; +} + +static int unmask_port(XenEvtchnState *s, evtchn_port_t port, bool do_unmask) +{ + void *vcpu_info, *shinfo; + + if (s->port_table[port].type == EVTCHNSTAT_closed) { + return -EINVAL; + } + + shinfo = xen_overlay_get_shinfo_ptr(); + if (!shinfo) { + return -ENOTSUP; + } + + vcpu_info = kvm_xen_get_vcpu_info_hva(s->port_table[port].vcpu); + if (!vcpu_info) { + return -EINVAL; + } + + if (xen_is_long_mode()) { + return do_unmask_port_lm(s, port, do_unmask, shinfo, vcpu_info); + } else { + return do_unmask_port_compat(s, port, do_unmask, shinfo, vcpu_info); + } +} + +static int do_set_port_lm(XenEvtchnState *s, evtchn_port_t port, + struct shared_info *shinfo, + struct vcpu_info *vcpu_info) +{ + const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]); + typeof(shinfo->evtchn_pending[0]) mask; + int idx = port / bits_per_word; + int offset = port % bits_per_word; + + mask = 1UL << offset; + + if (idx >= bits_per_word) { + return -EINVAL; + } + + /* Update the pending bit itself. If it was already set, we're done. */ + if (qatomic_fetch_or(&shinfo->evtchn_pending[idx], mask) & mask) { + return 0; + } + + /* Check if it's masked. */ + if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) { + return 0; + } + + /* Now on to the vcpu_info evtchn_pending_sel index... */ + mask = 1UL << idx; + + /* If a port in this word was already pending for this vCPU, all done. 
*/ + if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) { + return 0; + } + + /* Set evtchn_upcall_pending for this vCPU */ + if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) { + return 0; + } + + inject_callback(s, s->port_table[port].vcpu); + + return 0; +} + +static int do_set_port_compat(XenEvtchnState *s, evtchn_port_t port, + struct compat_shared_info *shinfo, + struct compat_vcpu_info *vcpu_info) +{ + const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]); + typeof(shinfo->evtchn_pending[0]) mask; + int idx = port / bits_per_word; + int offset = port % bits_per_word; + + mask = 1UL << offset; + + if (idx >= bits_per_word) { + return -EINVAL; + } + + /* Update the pending bit itself. If it was already set, we're done. */ + if (qatomic_fetch_or(&shinfo->evtchn_pending[idx], mask) & mask) { + return 0; + } + + /* Check if it's masked. */ + if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) { + return 0; + } + + /* Now on to the vcpu_info evtchn_pending_sel index... */ + mask = 1UL << idx; + + /* If a port in this word was already pending for this vCPU, all done. */ + if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) { + return 0; + } + + /* Set evtchn_upcall_pending for this vCPU */ + if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) { + return 0; + } + + inject_callback(s, s->port_table[port].vcpu); + + return 0; +} + +static int set_port_pending(XenEvtchnState *s, evtchn_port_t port) +{ + void *vcpu_info, *shinfo; + + if (s->port_table[port].type == EVTCHNSTAT_closed) { + return -EINVAL; + } + + if (s->evtchn_in_kernel) { + XenEvtchnPort *p = &s->port_table[port]; + CPUState *cpu = qemu_get_cpu(p->vcpu); + struct kvm_irq_routing_xen_evtchn evt; + + if (!cpu) { + return 0; + } + + evt.port = port; + evt.vcpu = kvm_arch_vcpu_id(cpu); + evt.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_EVTCHN_SEND, &evt); + } + + shinfo = xen_overlay_get_shinfo_ptr(); + if (!shinfo) { + return -ENOTSUP; + } + + vcpu_info = kvm_xen_get_vcpu_info_hva(s->port_table[port].vcpu); + if (!vcpu_info) { + return -EINVAL; + } + + if (xen_is_long_mode()) { + return do_set_port_lm(s, port, shinfo, vcpu_info); + } else { + return do_set_port_compat(s, port, shinfo, vcpu_info); + } +} + +static int clear_port_pending(XenEvtchnState *s, evtchn_port_t port) +{ + void *p = xen_overlay_get_shinfo_ptr(); + + if (!p) { + return -ENOTSUP; + } + + if (xen_is_long_mode()) { + struct shared_info *shinfo = p; + const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]); + typeof(shinfo->evtchn_pending[0]) mask; + int idx = port / bits_per_word; + int offset = port % bits_per_word; + + mask = 1UL << offset; + + qatomic_fetch_and(&shinfo->evtchn_pending[idx], ~mask); + } else { + struct compat_shared_info *shinfo = p; + const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]); + typeof(shinfo->evtchn_pending[0]) mask; + int idx = port / bits_per_word; + int offset = port % bits_per_word; + + mask = 1UL << offset; + + qatomic_fetch_and(&shinfo->evtchn_pending[idx], ~mask); + } + return 0; +} + +static void free_port(XenEvtchnState *s, evtchn_port_t port) +{ + s->port_table[port].type = EVTCHNSTAT_closed; + s->port_table[port].type_val = 0; + s->port_table[port].vcpu = 0; + + if (s->nr_ports == port + 1) { + do { + s->nr_ports--; + } while (s->nr_ports && + s->port_table[s->nr_ports - 1].type == EVTCHNSTAT_closed); + } + + /* Clear pending event to avoid 
unexpected behavior on re-bind. */ + clear_port_pending(s, port); +} + +static int allocate_port(XenEvtchnState *s, uint32_t vcpu, uint16_t type, + uint16_t val, evtchn_port_t *port) +{ + evtchn_port_t p = 1; + + for (p = 1; valid_port(p); p++) { + if (s->port_table[p].type == EVTCHNSTAT_closed) { + s->port_table[p].vcpu = vcpu; + s->port_table[p].type = type; + s->port_table[p].type_val = val; + + *port = p; + + if (s->nr_ports < p + 1) { + s->nr_ports = p + 1; + } + + return 0; + } + } + return -ENOSPC; +} + +static bool virq_is_global(uint32_t virq) +{ + switch (virq) { + case VIRQ_TIMER: + case VIRQ_DEBUG: + case VIRQ_XENOPROF: + case VIRQ_XENPMU: + return false; + + default: + return true; + } +} + +static int close_port(XenEvtchnState *s, evtchn_port_t port, + bool *flush_kvm_routes) +{ + XenEvtchnPort *p = &s->port_table[port]; + + /* Because it *might* be a PIRQ port */ + assert(qemu_mutex_iothread_locked()); + + switch (p->type) { + case EVTCHNSTAT_closed: + return -ENOENT; + + case EVTCHNSTAT_pirq: + s->pirq[p->type_val].port = 0; + if (s->pirq[p->type_val].is_translated) { + *flush_kvm_routes = true; + } + break; + + case EVTCHNSTAT_virq: + kvm_xen_set_vcpu_virq(virq_is_global(p->type_val) ? 0 : p->vcpu, + p->type_val, 0); + break; + + case EVTCHNSTAT_ipi: + if (s->evtchn_in_kernel) { + deassign_kernel_port(port); + } + break; + + case EVTCHNSTAT_interdomain: + if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) { + uint16_t be_port = p->type_val & ~PORT_INFO_TYPEVAL_REMOTE_QEMU; + struct xenevtchn_handle *xc = s->be_handles[be_port]; + if (xc) { + if (kvm_xen_has_cap(EVTCHN_SEND)) { + deassign_kernel_port(port); + } + xc->guest_port = 0; + } + } else { + /* Loopback interdomain */ + XenEvtchnPort *rp = &s->port_table[p->type_val]; + if (!valid_port(p->type_val) || rp->type_val != port || + rp->type != EVTCHNSTAT_interdomain) { + error_report("Inconsistent state for interdomain unbind"); + } else { + /* Set the other end back to unbound */ + rp->type = EVTCHNSTAT_unbound; + rp->type_val = 0; + } + } + break; + + default: + break; + } + + free_port(s, port); + return 0; +} + +int xen_evtchn_soft_reset(void) +{ + XenEvtchnState *s = xen_evtchn_singleton; + bool flush_kvm_routes; + int i; + + if (!s) { + return -ENOTSUP; + } + + assert(qemu_mutex_iothread_locked()); + + qemu_mutex_lock(&s->port_lock); + + for (i = 0; i < s->nr_ports; i++) { + close_port(s, i, &flush_kvm_routes); + } + + qemu_mutex_unlock(&s->port_lock); + + if (flush_kvm_routes) { + kvm_update_msi_routes_all(NULL, true, 0, 0); + } + + return 0; +} + +int xen_evtchn_reset_op(struct evtchn_reset *reset) +{ + if (reset->dom != DOMID_SELF && reset->dom != xen_domid) { + return -ESRCH; + } + + return xen_evtchn_soft_reset(); +} + +int xen_evtchn_close_op(struct evtchn_close *close) +{ + XenEvtchnState *s = xen_evtchn_singleton; + bool flush_kvm_routes = false; + int ret; + + if (!s) { + return -ENOTSUP; + } + + if (!valid_port(close->port)) { + return -EINVAL; + } + + QEMU_IOTHREAD_LOCK_GUARD(); + qemu_mutex_lock(&s->port_lock); + + ret = close_port(s, close->port, &flush_kvm_routes); + + qemu_mutex_unlock(&s->port_lock); + + if (flush_kvm_routes) { + kvm_update_msi_routes_all(NULL, true, 0, 0); + } + + return ret; +} + +int xen_evtchn_unmask_op(struct evtchn_unmask *unmask) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int ret; + + if (!s) { + return -ENOTSUP; + } + + if (!valid_port(unmask->port)) { + return -EINVAL; + } + + qemu_mutex_lock(&s->port_lock); + + ret = unmask_port(s, unmask->port, true); + + 
qemu_mutex_unlock(&s->port_lock); + + return ret; +} + +int xen_evtchn_bind_vcpu_op(struct evtchn_bind_vcpu *vcpu) +{ + XenEvtchnState *s = xen_evtchn_singleton; + XenEvtchnPort *p; + int ret = -EINVAL; + + if (!s) { + return -ENOTSUP; + } + + if (!valid_port(vcpu->port)) { + return -EINVAL; + } + + if (!valid_vcpu(vcpu->vcpu)) { + return -ENOENT; + } + + qemu_mutex_lock(&s->port_lock); + + p = &s->port_table[vcpu->port]; + + if (p->type == EVTCHNSTAT_interdomain || + p->type == EVTCHNSTAT_unbound || + p->type == EVTCHNSTAT_pirq || + (p->type == EVTCHNSTAT_virq && virq_is_global(p->type_val))) { + /* + * unmask_port() with do_unmask==false will just raise the event + * on the new vCPU if the port was already pending. + */ + p->vcpu = vcpu->vcpu; + unmask_port(s, vcpu->port, false); + ret = 0; + } + + qemu_mutex_unlock(&s->port_lock); + + return ret; +} + +int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int ret; + + if (!s) { + return -ENOTSUP; + } + + if (virq->virq >= NR_VIRQS) { + return -EINVAL; + } + + /* Global VIRQ must be allocated on vCPU0 first */ + if (virq_is_global(virq->virq) && virq->vcpu != 0) { + return -EINVAL; + } + + if (!valid_vcpu(virq->vcpu)) { + return -ENOENT; + } + + qemu_mutex_lock(&s->port_lock); + + ret = allocate_port(s, virq->vcpu, EVTCHNSTAT_virq, virq->virq, + &virq->port); + if (!ret) { + ret = kvm_xen_set_vcpu_virq(virq->vcpu, virq->virq, virq->port); + if (ret) { + free_port(s, virq->port); + } + } + + qemu_mutex_unlock(&s->port_lock); + + return ret; +} + +int xen_evtchn_bind_pirq_op(struct evtchn_bind_pirq *pirq) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int ret; + + if (!s) { + return -ENOTSUP; + } + + if (pirq->pirq >= s->nr_pirqs) { + return -EINVAL; + } + + QEMU_IOTHREAD_LOCK_GUARD(); + + if (s->pirq[pirq->pirq].port) { + return -EBUSY; + } + + qemu_mutex_lock(&s->port_lock); + + ret = allocate_port(s, 0, EVTCHNSTAT_pirq, pirq->pirq, + &pirq->port); + if (ret) { + qemu_mutex_unlock(&s->port_lock); + return ret; + } + + s->pirq[pirq->pirq].port = pirq->port; + trace_kvm_xen_bind_pirq(pirq->pirq, pirq->port); + + qemu_mutex_unlock(&s->port_lock); + + /* + * Need to do the unmask outside port_lock because it may call + * back into the MSI translate function. + */ + if (s->pirq[pirq->pirq].gsi == IRQ_MSI_EMU) { + if (s->pirq[pirq->pirq].is_masked) { + PCIDevice *dev = s->pirq[pirq->pirq].dev; + int vector = s->pirq[pirq->pirq].vector; + char *dev_path = qdev_get_dev_path(DEVICE(dev)); + + trace_kvm_xen_unmask_pirq(pirq->pirq, dev_path, vector); + g_free(dev_path); + + if (s->pirq[pirq->pirq].is_msix) { + msix_set_mask(dev, vector, false); + } else { + msi_set_mask(dev, vector, false, NULL); + } + } else if (s->pirq[pirq->pirq].is_translated) { + /* + * If KVM had attempted to translate this one before, make it try + * again. If we unmasked, then the notifier on the MSI(-X) vector + * will already have had the same effect. 
+ */ + kvm_update_msi_routes_all(NULL, true, 0, 0); + } + } + + return ret; +} + +int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int ret; + + if (!s) { + return -ENOTSUP; + } + + if (!valid_vcpu(ipi->vcpu)) { + return -ENOENT; + } + + qemu_mutex_lock(&s->port_lock); + + ret = allocate_port(s, ipi->vcpu, EVTCHNSTAT_ipi, 0, &ipi->port); + if (!ret && s->evtchn_in_kernel) { + assign_kernel_port(EVTCHNSTAT_ipi, ipi->port, ipi->vcpu); + } + + qemu_mutex_unlock(&s->port_lock); + + return ret; +} + +int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain) +{ + XenEvtchnState *s = xen_evtchn_singleton; + uint16_t type_val; + int ret; + + if (!s) { + return -ENOTSUP; + } + + if (interdomain->remote_dom == DOMID_QEMU) { + type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU; + } else if (interdomain->remote_dom == DOMID_SELF || + interdomain->remote_dom == xen_domid) { + type_val = 0; + } else { + return -ESRCH; + } + + if (!valid_port(interdomain->remote_port)) { + return -EINVAL; + } + + qemu_mutex_lock(&s->port_lock); + + /* The newly allocated port starts out as unbound */ + ret = allocate_port(s, 0, EVTCHNSTAT_unbound, type_val, + &interdomain->local_port); + if (ret) { + goto out; + } + + if (interdomain->remote_dom == DOMID_QEMU) { + struct xenevtchn_handle *xc = s->be_handles[interdomain->remote_port]; + XenEvtchnPort *lp = &s->port_table[interdomain->local_port]; + + if (!xc) { + ret = -ENOENT; + goto out_free_port; + } + + if (xc->guest_port) { + ret = -EBUSY; + goto out_free_port; + } + + assert(xc->be_port == interdomain->remote_port); + xc->guest_port = interdomain->local_port; + if (kvm_xen_has_cap(EVTCHN_SEND)) { + assign_kernel_eventfd(lp->type, xc->guest_port, xc->fd); + } + lp->type = EVTCHNSTAT_interdomain; + lp->type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU | interdomain->remote_port; + ret = 0; + } else { + /* Loopback */ + XenEvtchnPort *rp = &s->port_table[interdomain->remote_port]; + XenEvtchnPort *lp = &s->port_table[interdomain->local_port]; + + if (rp->type == EVTCHNSTAT_unbound && rp->type_val == 0) { + /* It's a match! 
*/ + rp->type = EVTCHNSTAT_interdomain; + rp->type_val = interdomain->local_port; + + lp->type = EVTCHNSTAT_interdomain; + lp->type_val = interdomain->remote_port; + } else { + ret = -EINVAL; + } + } + + out_free_port: + if (ret) { + free_port(s, interdomain->local_port); + } + out: + qemu_mutex_unlock(&s->port_lock); + + return ret; + +} +int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc) +{ + XenEvtchnState *s = xen_evtchn_singleton; + uint16_t type_val; + int ret; + + if (!s) { + return -ENOTSUP; + } + + if (alloc->dom != DOMID_SELF && alloc->dom != xen_domid) { + return -ESRCH; + } + + if (alloc->remote_dom == DOMID_QEMU) { + type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU; + } else if (alloc->remote_dom == DOMID_SELF || + alloc->remote_dom == xen_domid) { + type_val = 0; + } else { + return -EPERM; + } + + qemu_mutex_lock(&s->port_lock); + + ret = allocate_port(s, 0, EVTCHNSTAT_unbound, type_val, &alloc->port); + + qemu_mutex_unlock(&s->port_lock); + + return ret; +} + +int xen_evtchn_send_op(struct evtchn_send *send) +{ + XenEvtchnState *s = xen_evtchn_singleton; + XenEvtchnPort *p; + int ret = 0; + + if (!s) { + return -ENOTSUP; + } + + if (!valid_port(send->port)) { + return -EINVAL; + } + + qemu_mutex_lock(&s->port_lock); + + p = &s->port_table[send->port]; + + switch (p->type) { + case EVTCHNSTAT_interdomain: + if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) { + /* + * This is an event from the guest to qemu itself, which is + * serving as the driver domain. + */ + uint16_t be_port = p->type_val & ~PORT_INFO_TYPEVAL_REMOTE_QEMU; + struct xenevtchn_handle *xc = s->be_handles[be_port]; + if (xc) { + eventfd_write(xc->fd, 1); + ret = 0; + } else { + ret = -ENOENT; + } + } else { + /* Loopback interdomain ports; just a complex IPI */ + set_port_pending(s, p->type_val); + } + break; + + case EVTCHNSTAT_ipi: + set_port_pending(s, send->port); + break; + + case EVTCHNSTAT_unbound: + /* Xen will silently drop these */ + break; + + default: + ret = -EINVAL; + break; + } + + qemu_mutex_unlock(&s->port_lock); + + return ret; +} + +int xen_evtchn_set_port(uint16_t port) +{ + XenEvtchnState *s = xen_evtchn_singleton; + XenEvtchnPort *p; + int ret = -EINVAL; + + if (!s) { + return -ENOTSUP; + } + + if (!valid_port(port)) { + return -EINVAL; + } + + qemu_mutex_lock(&s->port_lock); + + p = &s->port_table[port]; + + /* QEMU has no business sending to anything but these */ + if (p->type == EVTCHNSTAT_virq || + (p->type == EVTCHNSTAT_interdomain && + (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU))) { + set_port_pending(s, port); + ret = 0; + } + + qemu_mutex_unlock(&s->port_lock); + + return ret; +} + +static int allocate_pirq(XenEvtchnState *s, int type, int gsi) +{ + uint16_t pirq; + + /* + * Preserve the allocation strategy that Xen has. It looks like + * we *never* give out PIRQ 0-15, we give out 16-nr_irqs_gsi only + * to GSIs (counting up from 16), and then we count backwards from + * the top for MSIs or when the GSI space is exhausted. 
+ */ + if (type == MAP_PIRQ_TYPE_GSI) { + for (pirq = 16 ; pirq < IOAPIC_NUM_PINS; pirq++) { + if (pirq_inuse(s, pirq)) { + continue; + } + + /* Found it */ + goto found; + } + } + for (pirq = s->nr_pirqs - 1; pirq >= IOAPIC_NUM_PINS; pirq--) { + /* Skip whole words at a time when they're full */ + if (pirq_inuse_word(s, pirq) == UINT64_MAX) { + pirq &= ~63ULL; + continue; + } + if (pirq_inuse(s, pirq)) { + continue; + } + + goto found; + } + return -ENOSPC; + + found: + pirq_inuse_word(s, pirq) |= pirq_inuse_bit(pirq); + if (gsi >= 0) { + assert(gsi <= IOAPIC_NUM_PINS); + s->gsi_pirq[gsi] = pirq; + } + s->pirq[pirq].gsi = gsi; + return pirq; +} + +bool xen_evtchn_set_gsi(int gsi, int level) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int pirq; + + assert(qemu_mutex_iothread_locked()); + + if (!s || gsi < 0 || gsi > IOAPIC_NUM_PINS) { + return false; + } + + /* + * Check that that it *isn't* the event channel GSI, and thus + * that we are not recursing and it's safe to take s->port_lock. + * + * Locking aside, it's perfectly sane to bail out early for that + * special case, as it would make no sense for the event channel + * GSI to be routed back to event channels, when the delivery + * method is to raise the GSI... that recursion wouldn't *just* + * be a locking issue. + */ + if (gsi && gsi == s->callback_gsi) { + return false; + } + + QEMU_LOCK_GUARD(&s->port_lock); + + pirq = s->gsi_pirq[gsi]; + if (!pirq) { + return false; + } + + if (level) { + int port = s->pirq[pirq].port; + + s->pirq_gsi_set |= (1U << gsi); + if (port) { + set_port_pending(s, port); + } + } else { + s->pirq_gsi_set &= ~(1U << gsi); + } + return true; +} + +static uint32_t msi_pirq_target(uint64_t addr, uint32_t data) +{ + /* The vector (in low 8 bits of data) must be zero */ + if (data & 0xff) { + return 0; + } + + uint32_t pirq = (addr & 0xff000) >> 12; + pirq |= (addr >> 32) & 0xffffff00; + + return pirq; +} + +static void do_remove_pci_vector(XenEvtchnState *s, PCIDevice *dev, int vector, + int except_pirq) +{ + uint32_t pirq; + + for (pirq = 0; pirq < s->nr_pirqs; pirq++) { + /* + * We could be cleverer here, but it isn't really a fast path, and + * this trivial optimisation is enough to let us skip the big gap + * in the middle a bit quicker (in terms of both loop iterations, + * and cache lines). + */ + if (!(pirq & 63) && !(pirq_inuse_word(s, pirq))) { + pirq += 64; + continue; + } + if (except_pirq && pirq == except_pirq) { + continue; + } + if (s->pirq[pirq].dev != dev) { + continue; + } + if (vector != -1 && s->pirq[pirq].vector != vector) { + continue; + } + + /* It could theoretically be bound to a port already, but that is OK. */ + s->pirq[pirq].dev = dev; + s->pirq[pirq].gsi = IRQ_UNBOUND; + s->pirq[pirq].is_msix = false; + s->pirq[pirq].vector = 0; + s->pirq[pirq].is_masked = false; + s->pirq[pirq].is_translated = false; + } +} + +void xen_evtchn_remove_pci_device(PCIDevice *dev) +{ + XenEvtchnState *s = xen_evtchn_singleton; + + if (!s) { + return; + } + + QEMU_LOCK_GUARD(&s->port_lock); + do_remove_pci_vector(s, dev, -1, 0); +} + +void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector, + uint64_t addr, uint32_t data, bool is_masked) +{ + XenEvtchnState *s = xen_evtchn_singleton; + uint32_t pirq; + + if (!s) { + return; + } + + assert(qemu_mutex_iothread_locked()); + + pirq = msi_pirq_target(addr, data); + + /* + * The PIRQ# must be sane, and there must be an allocated PIRQ in + * IRQ_UNBOUND or IRQ_MSI_EMU state to match it. 
+ */ + if (!pirq || pirq >= s->nr_pirqs || !pirq_inuse(s, pirq) || + (s->pirq[pirq].gsi != IRQ_UNBOUND && + s->pirq[pirq].gsi != IRQ_MSI_EMU)) { + pirq = 0; + } + + if (pirq) { + s->pirq[pirq].dev = dev; + s->pirq[pirq].gsi = IRQ_MSI_EMU; + s->pirq[pirq].is_msix = is_msix; + s->pirq[pirq].vector = vector; + s->pirq[pirq].is_masked = is_masked; + } + + /* Remove any (other) entries for this {device, vector} */ + do_remove_pci_vector(s, dev, vector, pirq); +} + +int xen_evtchn_translate_pirq_msi(struct kvm_irq_routing_entry *route, + uint64_t address, uint32_t data) +{ + XenEvtchnState *s = xen_evtchn_singleton; + uint32_t pirq, port; + CPUState *cpu; + + if (!s) { + return 1; /* Not a PIRQ */ + } + + assert(qemu_mutex_iothread_locked()); + + pirq = msi_pirq_target(address, data); + if (!pirq || pirq >= s->nr_pirqs) { + return 1; /* Not a PIRQ */ + } + + if (!kvm_xen_has_cap(EVTCHN_2LEVEL)) { + return -ENOTSUP; + } + + if (s->pirq[pirq].gsi != IRQ_MSI_EMU) { + return -EINVAL; + } + + /* Remember that KVM tried to translate this. It might need to try again. */ + s->pirq[pirq].is_translated = true; + + QEMU_LOCK_GUARD(&s->port_lock); + + port = s->pirq[pirq].port; + if (!valid_port(port)) { + return -EINVAL; + } + + cpu = qemu_get_cpu(s->port_table[port].vcpu); + if (!cpu) { + return -EINVAL; + } + + route->type = KVM_IRQ_ROUTING_XEN_EVTCHN; + route->u.xen_evtchn.port = port; + route->u.xen_evtchn.vcpu = kvm_arch_vcpu_id(cpu); + route->u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + return 0; /* Handled */ +} + +bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data) +{ + XenEvtchnState *s = xen_evtchn_singleton; + uint32_t pirq, port; + + if (!s) { + return false; + } + + assert(qemu_mutex_iothread_locked()); + + pirq = msi_pirq_target(address, data); + if (!pirq || pirq >= s->nr_pirqs) { + return false; + } + + QEMU_LOCK_GUARD(&s->port_lock); + + port = s->pirq[pirq].port; + if (!valid_port(port)) { + return false; + } + + set_port_pending(s, port); + return true; +} + +int xen_physdev_map_pirq(struct physdev_map_pirq *map) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int pirq = map->pirq; + int gsi = map->index; + + if (!s) { + return -ENOTSUP; + } + + QEMU_IOTHREAD_LOCK_GUARD(); + QEMU_LOCK_GUARD(&s->port_lock); + + if (map->domid != DOMID_SELF && map->domid != xen_domid) { + return -EPERM; + } + if (map->type != MAP_PIRQ_TYPE_GSI) { + return -EINVAL; + } + if (gsi < 0 || gsi >= IOAPIC_NUM_PINS) { + return -EINVAL; + } + + if (pirq < 0) { + pirq = allocate_pirq(s, map->type, gsi); + if (pirq < 0) { + return pirq; + } + map->pirq = pirq; + } else if (pirq > s->nr_pirqs) { + return -EINVAL; + } else { + /* + * User specified a valid-looking PIRQ#. Allow it if it is + * allocated and not yet bound, or if it is unallocated + */ + if (pirq_inuse(s, pirq)) { + if (s->pirq[pirq].gsi != IRQ_UNBOUND) { + return -EBUSY; + } + } else { + /* If it was unused, mark it used now. */ + pirq_inuse_word(s, pirq) |= pirq_inuse_bit(pirq); + } + /* Set the mapping in both directions. 
*/ + s->pirq[pirq].gsi = gsi; + s->gsi_pirq[gsi] = pirq; + } + + trace_kvm_xen_map_pirq(pirq, gsi); + return 0; +} + +int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int pirq = unmap->pirq; + int gsi; + + if (!s) { + return -ENOTSUP; + } + + if (unmap->domid != DOMID_SELF && unmap->domid != xen_domid) { + return -EPERM; + } + if (pirq < 0 || pirq >= s->nr_pirqs) { + return -EINVAL; + } + + QEMU_IOTHREAD_LOCK_GUARD(); + qemu_mutex_lock(&s->port_lock); + + if (!pirq_inuse(s, pirq)) { + qemu_mutex_unlock(&s->port_lock); + return -ENOENT; + } + + gsi = s->pirq[pirq].gsi; + + /* We can only unmap GSI PIRQs */ + if (gsi < 0) { + qemu_mutex_unlock(&s->port_lock); + return -EINVAL; + } + + s->gsi_pirq[gsi] = 0; + s->pirq[pirq].gsi = IRQ_UNBOUND; /* Doesn't actually matter because: */ + pirq_inuse_word(s, pirq) &= ~pirq_inuse_bit(pirq); + + trace_kvm_xen_unmap_pirq(pirq, gsi); + qemu_mutex_unlock(&s->port_lock); + + if (gsi == IRQ_MSI_EMU) { + kvm_update_msi_routes_all(NULL, true, 0, 0); + } + + return 0; +} + +int xen_physdev_eoi_pirq(struct physdev_eoi *eoi) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int pirq = eoi->irq; + int gsi; + + if (!s) { + return -ENOTSUP; + } + + QEMU_IOTHREAD_LOCK_GUARD(); + QEMU_LOCK_GUARD(&s->port_lock); + + if (!pirq_inuse(s, pirq)) { + return -ENOENT; + } + + gsi = s->pirq[pirq].gsi; + if (gsi < 0) { + return -EINVAL; + } + + /* Reassert a level IRQ if needed */ + if (s->pirq_gsi_set & (1U << gsi)) { + int port = s->pirq[pirq].port; + if (port) { + set_port_pending(s, port); + } + } + + return 0; +} + +int xen_physdev_query_pirq(struct physdev_irq_status_query *query) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int pirq = query->irq; + + if (!s) { + return -ENOTSUP; + } + + QEMU_IOTHREAD_LOCK_GUARD(); + QEMU_LOCK_GUARD(&s->port_lock); + + if (!pirq_inuse(s, pirq)) { + return -ENOENT; + } + + if (s->pirq[pirq].gsi >= 0) { + query->flags = XENIRQSTAT_needs_eoi; + } else { + query->flags = 0; + } + + return 0; +} + +int xen_physdev_get_free_pirq(struct physdev_get_free_pirq *get) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int pirq; + + if (!s) { + return -ENOTSUP; + } + + QEMU_LOCK_GUARD(&s->port_lock); + + pirq = allocate_pirq(s, get->type, IRQ_UNBOUND); + if (pirq < 0) { + return pirq; + } + + get->pirq = pirq; + trace_kvm_xen_get_free_pirq(pirq, get->type); + return 0; +} + +struct xenevtchn_handle *xen_be_evtchn_open(void) +{ + struct xenevtchn_handle *xc = g_new0(struct xenevtchn_handle, 1); + + xc->fd = eventfd(0, EFD_CLOEXEC); + if (xc->fd < 0) { + free(xc); + return NULL; + } + + return xc; +} + +static int find_be_port(XenEvtchnState *s, struct xenevtchn_handle *xc) +{ + int i; + + for (i = 1; i < EVTCHN_2L_NR_CHANNELS; i++) { + if (!s->be_handles[i]) { + s->be_handles[i] = xc; + xc->be_port = i; + return i; + } + } + return 0; +} + +int xen_be_evtchn_bind_interdomain(struct xenevtchn_handle *xc, uint32_t domid, + evtchn_port_t guest_port) +{ + XenEvtchnState *s = xen_evtchn_singleton; + XenEvtchnPort *gp; + uint16_t be_port = 0; + int ret; + + if (!s) { + return -ENOTSUP; + } + + if (!xc) { + return -EFAULT; + } + + if (domid != xen_domid) { + return -ESRCH; + } + + if (!valid_port(guest_port)) { + return -EINVAL; + } + + qemu_mutex_lock(&s->port_lock); + + /* The guest has to have an unbound port waiting for us to bind */ + gp = &s->port_table[guest_port]; + + switch (gp->type) { + case EVTCHNSTAT_interdomain: + /* Allow rebinding after migration, preserve port # if possible */ + 
be_port = gp->type_val & ~PORT_INFO_TYPEVAL_REMOTE_QEMU; + assert(be_port != 0); + if (!s->be_handles[be_port]) { + s->be_handles[be_port] = xc; + xc->guest_port = guest_port; + ret = xc->be_port = be_port; + if (kvm_xen_has_cap(EVTCHN_SEND)) { + assign_kernel_eventfd(gp->type, guest_port, xc->fd); + } + break; + } + /* fall through */ + + case EVTCHNSTAT_unbound: + be_port = find_be_port(s, xc); + if (!be_port) { + ret = -ENOSPC; + goto out; + } + + gp->type = EVTCHNSTAT_interdomain; + gp->type_val = be_port | PORT_INFO_TYPEVAL_REMOTE_QEMU; + xc->guest_port = guest_port; + if (kvm_xen_has_cap(EVTCHN_SEND)) { + assign_kernel_eventfd(gp->type, guest_port, xc->fd); + } + ret = be_port; + break; + + default: + ret = -EINVAL; + break; + } + + out: + qemu_mutex_unlock(&s->port_lock); + + return ret; +} + +int xen_be_evtchn_unbind(struct xenevtchn_handle *xc, evtchn_port_t port) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int ret; + + if (!s) { + return -ENOTSUP; + } + + if (!xc) { + return -EFAULT; + } + + qemu_mutex_lock(&s->port_lock); + + if (port && port != xc->be_port) { + ret = -EINVAL; + goto out; + } + + if (xc->guest_port) { + XenEvtchnPort *gp = &s->port_table[xc->guest_port]; + + /* This should never *not* be true */ + if (gp->type == EVTCHNSTAT_interdomain) { + gp->type = EVTCHNSTAT_unbound; + gp->type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU; + } + + if (kvm_xen_has_cap(EVTCHN_SEND)) { + deassign_kernel_port(xc->guest_port); + } + xc->guest_port = 0; + } + + s->be_handles[xc->be_port] = NULL; + xc->be_port = 0; + ret = 0; + out: + qemu_mutex_unlock(&s->port_lock); + return ret; +} + +int xen_be_evtchn_close(struct xenevtchn_handle *xc) +{ + if (!xc) { + return -EFAULT; + } + + xen_be_evtchn_unbind(xc, 0); + + close(xc->fd); + free(xc); + return 0; +} + +int xen_be_evtchn_fd(struct xenevtchn_handle *xc) +{ + if (!xc) { + return -1; + } + return xc->fd; +} + +int xen_be_evtchn_notify(struct xenevtchn_handle *xc, evtchn_port_t port) +{ + XenEvtchnState *s = xen_evtchn_singleton; + int ret; + + if (!s) { + return -ENOTSUP; + } + + if (!xc) { + return -EFAULT; + } + + qemu_mutex_lock(&s->port_lock); + + if (xc->guest_port) { + set_port_pending(s, xc->guest_port); + ret = 0; + } else { + ret = -ENOTCONN; + } + + qemu_mutex_unlock(&s->port_lock); + + return ret; +} + +int xen_be_evtchn_pending(struct xenevtchn_handle *xc) +{ + uint64_t val; + + if (!xc) { + return -EFAULT; + } + + if (!xc->be_port) { + return 0; + } + + if (eventfd_read(xc->fd, &val)) { + return -errno; + } + + return val ? xc->be_port : 0; +} + +int xen_be_evtchn_unmask(struct xenevtchn_handle *xc, evtchn_port_t port) +{ + if (!xc) { + return -EFAULT; + } + + if (xc->be_port != port) { + return -EINVAL; + } + + /* + * We don't actually do anything to unmask it; the event was already + * consumed in xen_be_evtchn_pending(). 
+ */ + return 0; +} + +int xen_be_evtchn_get_guest_port(struct xenevtchn_handle *xc) +{ + return xc->guest_port; +} + +EvtchnInfoList *qmp_xen_event_list(Error **errp) +{ + XenEvtchnState *s = xen_evtchn_singleton; + EvtchnInfoList *head = NULL, **tail = &head; + void *shinfo, *pending, *mask; + int i; + + if (!s) { + error_setg(errp, "Xen event channel emulation not enabled"); + return NULL; + } + + shinfo = xen_overlay_get_shinfo_ptr(); + if (!shinfo) { + error_setg(errp, "Xen shared info page not allocated"); + return NULL; + } + + if (xen_is_long_mode()) { + pending = shinfo + offsetof(struct shared_info, evtchn_pending); + mask = shinfo + offsetof(struct shared_info, evtchn_mask); + } else { + pending = shinfo + offsetof(struct compat_shared_info, evtchn_pending); + mask = shinfo + offsetof(struct compat_shared_info, evtchn_mask); + } + + QEMU_LOCK_GUARD(&s->port_lock); + + for (i = 0; i < s->nr_ports; i++) { + XenEvtchnPort *p = &s->port_table[i]; + EvtchnInfo *info; + + if (p->type == EVTCHNSTAT_closed) { + continue; + } + + info = g_new0(EvtchnInfo, 1); + + info->port = i; + qemu_build_assert(EVTCHN_PORT_TYPE_CLOSED == EVTCHNSTAT_closed); + qemu_build_assert(EVTCHN_PORT_TYPE_UNBOUND == EVTCHNSTAT_unbound); + qemu_build_assert(EVTCHN_PORT_TYPE_INTERDOMAIN == EVTCHNSTAT_interdomain); + qemu_build_assert(EVTCHN_PORT_TYPE_PIRQ == EVTCHNSTAT_pirq); + qemu_build_assert(EVTCHN_PORT_TYPE_VIRQ == EVTCHNSTAT_virq); + qemu_build_assert(EVTCHN_PORT_TYPE_IPI == EVTCHNSTAT_ipi); + + info->type = p->type; + if (p->type == EVTCHNSTAT_interdomain) { + info->remote_domain = g_strdup((p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) ? + "qemu" : "loopback"); + info->target = p->type_val & PORT_INFO_TYPEVAL_REMOTE_PORT_MASK; + } else { + info->target = p->type_val; + } + info->vcpu = p->vcpu; + info->pending = test_bit(i, pending); + info->masked = test_bit(i, mask); + + QAPI_LIST_APPEND(tail, info); + } + + return head; +} + +void qmp_xen_event_inject(uint32_t port, Error **errp) +{ + XenEvtchnState *s = xen_evtchn_singleton; + + if (!s) { + error_setg(errp, "Xen event channel emulation not enabled"); + return; + } + + if (!valid_port(port)) { + error_setg(errp, "Invalid port %u", port); + return; + } + + QEMU_LOCK_GUARD(&s->port_lock); + + if (set_port_pending(s, port)) { + error_setg(errp, "Failed to set port %u", port); + return; + } +} + +void hmp_xen_event_list(Monitor *mon, const QDict *qdict) +{ + EvtchnInfoList *iter, *info_list; + Error *err = NULL; + + info_list = qmp_xen_event_list(&err); + if (err) { + hmp_handle_error(mon, err); + return; + } + + for (iter = info_list; iter; iter = iter->next) { + EvtchnInfo *info = iter->value; + + monitor_printf(mon, "port %4u: vcpu: %d %s", info->port, info->vcpu, + EvtchnPortType_str(info->type)); + if (info->type != EVTCHN_PORT_TYPE_IPI) { + monitor_printf(mon, "("); + if (info->remote_domain) { + monitor_printf(mon, "%s:", info->remote_domain); + } + monitor_printf(mon, "%d)", info->target); + } + if (info->pending) { + monitor_printf(mon, " PENDING"); + } + if (info->masked) { + monitor_printf(mon, " MASKED"); + } + monitor_printf(mon, "\n"); + } + + qapi_free_EvtchnInfoList(info_list); +} + +void hmp_xen_event_inject(Monitor *mon, const QDict *qdict) +{ + int port = qdict_get_int(qdict, "port"); + Error *err = NULL; + + qmp_xen_event_inject(port, &err); + if (err) { + hmp_handle_error(mon, err); + } else { + monitor_printf(mon, "Delivered port %d\n", port); + } +} + diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h new file mode 100644 index
0000000..bfb67ac --- /dev/null +++ b/hw/i386/kvm/xen_evtchn.h @@ -0,0 +1,88 @@ +/* + * QEMU Xen emulation: Event channel support + * + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_XEN_EVTCHN_H +#define QEMU_XEN_EVTCHN_H + +#include "hw/sysbus.h" + +typedef uint32_t evtchn_port_t; + +void xen_evtchn_create(void); +int xen_evtchn_soft_reset(void); +int xen_evtchn_set_callback_param(uint64_t param); +void xen_evtchn_connect_gsis(qemu_irq *system_gsis); +void xen_evtchn_set_callback_level(int level); + +int xen_evtchn_set_port(uint16_t port); + +bool xen_evtchn_set_gsi(int gsi, int level); +void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector, + uint64_t addr, uint32_t data, bool is_masked); +void xen_evtchn_remove_pci_device(PCIDevice *dev); +struct kvm_irq_routing_entry; +int xen_evtchn_translate_pirq_msi(struct kvm_irq_routing_entry *route, + uint64_t address, uint32_t data); +bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data); + + +/* + * These functions mirror the libxenevtchn library API, providing the QEMU + * backend side of "interdomain" event channels. + */ +struct xenevtchn_handle; +struct xenevtchn_handle *xen_be_evtchn_open(void); +int xen_be_evtchn_bind_interdomain(struct xenevtchn_handle *xc, uint32_t domid, + evtchn_port_t guest_port); +int xen_be_evtchn_unbind(struct xenevtchn_handle *xc, evtchn_port_t port); +int xen_be_evtchn_close(struct xenevtchn_handle *xc); +int xen_be_evtchn_fd(struct xenevtchn_handle *xc); +int xen_be_evtchn_notify(struct xenevtchn_handle *xc, evtchn_port_t port); +int xen_be_evtchn_unmask(struct xenevtchn_handle *xc, evtchn_port_t port); +int xen_be_evtchn_pending(struct xenevtchn_handle *xc); +/* Apart from this which is a local addition */ +int xen_be_evtchn_get_guest_port(struct xenevtchn_handle *xc); + +struct evtchn_status; +struct evtchn_close; +struct evtchn_unmask; +struct evtchn_bind_virq; +struct evtchn_bind_pirq; +struct evtchn_bind_ipi; +struct evtchn_send; +struct evtchn_alloc_unbound; +struct evtchn_bind_interdomain; +struct evtchn_bind_vcpu; +struct evtchn_reset; +int xen_evtchn_status_op(struct evtchn_status *status); +int xen_evtchn_close_op(struct evtchn_close *close); +int xen_evtchn_unmask_op(struct evtchn_unmask *unmask); +int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq); +int xen_evtchn_bind_pirq_op(struct evtchn_bind_pirq *pirq); +int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi); +int xen_evtchn_send_op(struct evtchn_send *send); +int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc); +int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain); +int xen_evtchn_bind_vcpu_op(struct evtchn_bind_vcpu *vcpu); +int xen_evtchn_reset_op(struct evtchn_reset *reset); + +struct physdev_map_pirq; +struct physdev_unmap_pirq; +struct physdev_eoi; +struct physdev_irq_status_query; +struct physdev_get_free_pirq; +int xen_physdev_map_pirq(struct physdev_map_pirq *map); +int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap); +int xen_physdev_eoi_pirq(struct physdev_eoi *eoi); +int xen_physdev_query_pirq(struct physdev_irq_status_query *query); +int xen_physdev_get_free_pirq(struct physdev_get_free_pirq *get); + +#endif /* QEMU_XEN_EVTCHN_H */ diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c new 
file mode 100644 index 0000000..1e691de --- /dev/null +++ b/hw/i386/kvm/xen_gnttab.c @@ -0,0 +1,232 @@ +/* + * QEMU Xen emulation: Grant table support + * + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include "qemu/module.h" +#include "qemu/lockable.h" +#include "qemu/main-loop.h" +#include "qapi/error.h" +#include "qom/object.h" +#include "exec/target_page.h" +#include "exec/address-spaces.h" +#include "migration/vmstate.h" + +#include "hw/sysbus.h" +#include "hw/xen/xen.h" +#include "xen_overlay.h" +#include "xen_gnttab.h" + +#include "sysemu/kvm.h" +#include "sysemu/kvm_xen.h" + +#include "hw/xen/interface/memory.h" +#include "hw/xen/interface/grant_table.h" + +#define TYPE_XEN_GNTTAB "xen-gnttab" +OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB) + +#define XEN_PAGE_SHIFT 12 +#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT) + +#define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t)) + +struct XenGnttabState { + /*< private >*/ + SysBusDevice busdev; + /*< public >*/ + + QemuMutex gnt_lock; + + uint32_t nr_frames; + uint32_t max_frames; + + union { + grant_entry_v1_t *v1; + /* Theoretically, v2 support could be added here. */ + } entries; + + MemoryRegion gnt_frames; + MemoryRegion *gnt_aliases; + uint64_t *gnt_frame_gpas; +}; + +struct XenGnttabState *xen_gnttab_singleton; + +static void xen_gnttab_realize(DeviceState *dev, Error **errp) +{ + XenGnttabState *s = XEN_GNTTAB(dev); + int i; + + if (xen_mode != XEN_EMULATE) { + error_setg(errp, "Xen grant table support is for Xen emulation"); + return; + } + s->nr_frames = 0; + s->max_frames = kvm_xen_get_gnttab_max_frames(); + memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table", + XEN_PAGE_SIZE * s->max_frames, &error_abort); + memory_region_set_enabled(&s->gnt_frames, true); + s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames); + memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames); + + /* Create individual page-sizes aliases for overlays */ + s->gnt_aliases = (void *)g_new0(MemoryRegion, s->max_frames); + s->gnt_frame_gpas = (void *)g_new(uint64_t, s->max_frames); + for (i = 0; i < s->max_frames; i++) { + memory_region_init_alias(&s->gnt_aliases[i], OBJECT(dev), + NULL, &s->gnt_frames, + i * XEN_PAGE_SIZE, XEN_PAGE_SIZE); + s->gnt_frame_gpas[i] = INVALID_GPA; + } + + qemu_mutex_init(&s->gnt_lock); + + xen_gnttab_singleton = s; +} + +static int xen_gnttab_post_load(void *opaque, int version_id) +{ + XenGnttabState *s = XEN_GNTTAB(opaque); + uint32_t i; + + for (i = 0; i < s->nr_frames; i++) { + if (s->gnt_frame_gpas[i] != INVALID_GPA) { + xen_overlay_do_map_page(&s->gnt_aliases[i], s->gnt_frame_gpas[i]); + } + } + return 0; +} + +static bool xen_gnttab_is_needed(void *opaque) +{ + return xen_mode == XEN_EMULATE; +} + +static const VMStateDescription xen_gnttab_vmstate = { + .name = "xen_gnttab", + .version_id = 1, + .minimum_version_id = 1, + .needed = xen_gnttab_is_needed, + .post_load = xen_gnttab_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT32(nr_frames, XenGnttabState), + VMSTATE_VARRAY_UINT32(gnt_frame_gpas, XenGnttabState, nr_frames, 0, + vmstate_info_uint64, uint64_t), + VMSTATE_END_OF_LIST() + } +}; + +static void xen_gnttab_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = 
DEVICE_CLASS(klass); + + dc->realize = xen_gnttab_realize; + dc->vmsd = &xen_gnttab_vmstate; +} + +static const TypeInfo xen_gnttab_info = { + .name = TYPE_XEN_GNTTAB, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(XenGnttabState), + .class_init = xen_gnttab_class_init, +}; + +void xen_gnttab_create(void) +{ + xen_gnttab_singleton = XEN_GNTTAB(sysbus_create_simple(TYPE_XEN_GNTTAB, + -1, NULL)); +} + +static void xen_gnttab_register_types(void) +{ + type_register_static(&xen_gnttab_info); +} + +type_init(xen_gnttab_register_types) + +int xen_gnttab_map_page(uint64_t idx, uint64_t gfn) +{ + XenGnttabState *s = xen_gnttab_singleton; + uint64_t gpa = gfn << XEN_PAGE_SHIFT; + + if (!s) { + return -ENOTSUP; + } + + if (idx >= s->max_frames) { + return -EINVAL; + } + + QEMU_IOTHREAD_LOCK_GUARD(); + QEMU_LOCK_GUARD(&s->gnt_lock); + + xen_overlay_do_map_page(&s->gnt_aliases[idx], gpa); + + s->gnt_frame_gpas[idx] = gpa; + + if (s->nr_frames <= idx) { + s->nr_frames = idx + 1; + } + + return 0; +} + +int xen_gnttab_set_version_op(struct gnttab_set_version *set) +{ + int ret; + + switch (set->version) { + case 1: + ret = 0; + break; + + case 2: + /* Behave as before set_version was introduced. */ + ret = -ENOSYS; + break; + + default: + ret = -EINVAL; + } + + set->version = 1; + return ret; +} + +int xen_gnttab_get_version_op(struct gnttab_get_version *get) +{ + if (get->dom != DOMID_SELF && get->dom != xen_domid) { + return -ESRCH; + } + + get->version = 1; + return 0; +} + +int xen_gnttab_query_size_op(struct gnttab_query_size *size) +{ + XenGnttabState *s = xen_gnttab_singleton; + + if (!s) { + return -ENOTSUP; + } + + if (size->dom != DOMID_SELF && size->dom != xen_domid) { + size->status = GNTST_bad_domain; + return 0; + } + + size->status = GNTST_okay; + size->nr_frames = s->nr_frames; + size->max_nr_frames = s->max_frames; + return 0; +} diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h new file mode 100644 index 0000000..3bdbe96 --- /dev/null +++ b/hw/i386/kvm/xen_gnttab.h @@ -0,0 +1,25 @@ +/* + * QEMU Xen emulation: Grant table support + * + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_XEN_GNTTAB_H +#define QEMU_XEN_GNTTAB_H + +void xen_gnttab_create(void); +int xen_gnttab_map_page(uint64_t idx, uint64_t gfn); + +struct gnttab_set_version; +struct gnttab_get_version; +struct gnttab_query_size; +int xen_gnttab_set_version_op(struct gnttab_set_version *set); +int xen_gnttab_get_version_op(struct gnttab_get_version *get); +int xen_gnttab_query_size_op(struct gnttab_query_size *size); + +#endif /* QEMU_XEN_GNTTAB_H */ diff --git a/hw/i386/kvm/xen_overlay.c b/hw/i386/kvm/xen_overlay.c new file mode 100644 index 0000000..39fda1b --- /dev/null +++ b/hw/i386/kvm/xen_overlay.c @@ -0,0 +1,272 @@ +/* + * QEMU Xen emulation: Shared/overlay pages support + * + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include "qemu/module.h" +#include "qemu/main-loop.h" +#include "qapi/error.h" +#include "qom/object.h" +#include "exec/target_page.h" +#include "exec/address-spaces.h" +#include "migration/vmstate.h" + +#include "hw/sysbus.h" +#include "hw/xen/xen.h" +#include "xen_overlay.h" + +#include "sysemu/kvm.h" +#include "sysemu/kvm_xen.h" +#include <linux/kvm.h> + +#include "hw/xen/interface/memory.h" + + +#define TYPE_XEN_OVERLAY "xen-overlay" +OBJECT_DECLARE_SIMPLE_TYPE(XenOverlayState, XEN_OVERLAY) + +#define XEN_PAGE_SHIFT 12 +#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT) + +struct XenOverlayState { + /*< private >*/ + SysBusDevice busdev; + /*< public >*/ + + MemoryRegion shinfo_mem; + void *shinfo_ptr; + uint64_t shinfo_gpa; + bool long_mode; +}; + +struct XenOverlayState *xen_overlay_singleton; + +void xen_overlay_do_map_page(MemoryRegion *page, uint64_t gpa) +{ + /* + * Xen allows guests to map the same page as many times as it likes + * into guest physical frames. We don't, because it would be hard + * to track and restore them all. One mapping of each page is + * perfectly sufficient for all known guests... and we've tested + * that theory on a few now in other implementations. dwmw2. + */ + if (memory_region_is_mapped(page)) { + if (gpa == INVALID_GPA) { + memory_region_del_subregion(get_system_memory(), page); + } else { + /* Just move it */ + memory_region_set_address(page, gpa); + } + } else if (gpa != INVALID_GPA) { + memory_region_add_subregion_overlap(get_system_memory(), gpa, page, 0); + } +} + +/* KVM is the only existing back end for now. Let's not overengineer it yet. */ +static int xen_overlay_set_be_shinfo(uint64_t gfn) +{ + struct kvm_xen_hvm_attr xa = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, + .u.shared_info.gfn = gfn, + }; + + return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa); +} + + +static void xen_overlay_realize(DeviceState *dev, Error **errp) +{ + XenOverlayState *s = XEN_OVERLAY(dev); + + if (xen_mode != XEN_EMULATE) { + error_setg(errp, "Xen overlay page support is for Xen emulation"); + return; + } + + memory_region_init_ram(&s->shinfo_mem, OBJECT(dev), "xen:shared_info", + XEN_PAGE_SIZE, &error_abort); + memory_region_set_enabled(&s->shinfo_mem, true); + + s->shinfo_ptr = memory_region_get_ram_ptr(&s->shinfo_mem); + s->shinfo_gpa = INVALID_GPA; + s->long_mode = false; + memset(s->shinfo_ptr, 0, XEN_PAGE_SIZE); +} + +static int xen_overlay_pre_save(void *opaque) +{ + /* + * Fetch the kernel's idea of long_mode to avoid the race condition + * where the guest has set the hypercall page up in 64-bit mode but + * not yet made a hypercall by the time migration happens, so qemu + * hasn't yet noticed. 
+ */ + return xen_sync_long_mode(); +} + +static int xen_overlay_post_load(void *opaque, int version_id) +{ + XenOverlayState *s = opaque; + + if (s->shinfo_gpa != INVALID_GPA) { + xen_overlay_do_map_page(&s->shinfo_mem, s->shinfo_gpa); + xen_overlay_set_be_shinfo(s->shinfo_gpa >> XEN_PAGE_SHIFT); + } + if (s->long_mode) { + xen_set_long_mode(true); + } + + return 0; +} + +static bool xen_overlay_is_needed(void *opaque) +{ + return xen_mode == XEN_EMULATE; +} + +static const VMStateDescription xen_overlay_vmstate = { + .name = "xen_overlay", + .version_id = 1, + .minimum_version_id = 1, + .needed = xen_overlay_is_needed, + .pre_save = xen_overlay_pre_save, + .post_load = xen_overlay_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT64(shinfo_gpa, XenOverlayState), + VMSTATE_BOOL(long_mode, XenOverlayState), + VMSTATE_END_OF_LIST() + } +}; + +static void xen_overlay_reset(DeviceState *dev) +{ + kvm_xen_soft_reset(); +} + +static void xen_overlay_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->reset = xen_overlay_reset; + dc->realize = xen_overlay_realize; + dc->vmsd = &xen_overlay_vmstate; +} + +static const TypeInfo xen_overlay_info = { + .name = TYPE_XEN_OVERLAY, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(XenOverlayState), + .class_init = xen_overlay_class_init, +}; + +void xen_overlay_create(void) +{ + xen_overlay_singleton = XEN_OVERLAY(sysbus_create_simple(TYPE_XEN_OVERLAY, + -1, NULL)); + + /* If xen_domid wasn't explicitly set, at least make sure it isn't zero. */ + if (xen_domid == DOMID_QEMU) { + xen_domid = 1; + }; +} + +static void xen_overlay_register_types(void) +{ + type_register_static(&xen_overlay_info); +} + +type_init(xen_overlay_register_types) + +int xen_overlay_map_shinfo_page(uint64_t gpa) +{ + XenOverlayState *s = xen_overlay_singleton; + int ret; + + if (!s) { + return -ENOENT; + } + + assert(qemu_mutex_iothread_locked()); + + if (s->shinfo_gpa) { + /* If removing shinfo page, turn the kernel magic off first */ + ret = xen_overlay_set_be_shinfo(INVALID_GFN); + if (ret) { + return ret; + } + } + + xen_overlay_do_map_page(&s->shinfo_mem, gpa); + if (gpa != INVALID_GPA) { + ret = xen_overlay_set_be_shinfo(gpa >> XEN_PAGE_SHIFT); + if (ret) { + return ret; + } + } + s->shinfo_gpa = gpa; + + return 0; +} + +void *xen_overlay_get_shinfo_ptr(void) +{ + XenOverlayState *s = xen_overlay_singleton; + + if (!s) { + return NULL; + } + + return s->shinfo_ptr; +} + +int xen_sync_long_mode(void) +{ + int ret; + struct kvm_xen_hvm_attr xa = { + .type = KVM_XEN_ATTR_TYPE_LONG_MODE, + }; + + if (!xen_overlay_singleton) { + return -ENOENT; + } + + ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_GET_ATTR, &xa); + if (!ret) { + xen_overlay_singleton->long_mode = xa.u.long_mode; + } + + return ret; +} + +int xen_set_long_mode(bool long_mode) +{ + int ret; + struct kvm_xen_hvm_attr xa = { + .type = KVM_XEN_ATTR_TYPE_LONG_MODE, + .u.long_mode = long_mode, + }; + + if (!xen_overlay_singleton) { + return -ENOENT; + } + + ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa); + if (!ret) { + xen_overlay_singleton->long_mode = xa.u.long_mode; + } + + return ret; +} + +bool xen_is_long_mode(void) +{ + return xen_overlay_singleton && xen_overlay_singleton->long_mode; +} diff --git a/hw/i386/kvm/xen_overlay.h b/hw/i386/kvm/xen_overlay.h new file mode 100644 index 0000000..75ecb6b --- /dev/null +++ b/hw/i386/kvm/xen_overlay.h @@ -0,0 +1,26 @@ +/* + * QEMU Xen emulation: Shared/overlay pages support + * + * Copyright © 2022 
Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_XEN_OVERLAY_H +#define QEMU_XEN_OVERLAY_H + +void xen_overlay_create(void); + +int xen_overlay_map_shinfo_page(uint64_t gpa); +void *xen_overlay_get_shinfo_ptr(void); + +int xen_sync_long_mode(void); +int xen_set_long_mode(bool long_mode); +bool xen_is_long_mode(void); + +void xen_overlay_do_map_page(MemoryRegion *page, uint64_t gpa); + +#endif /* QEMU_XEN_OVERLAY_H */ diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c new file mode 100644 index 0000000..14193ef --- /dev/null +++ b/hw/i386/kvm/xen_xenstore.c @@ -0,0 +1,500 @@ +/* + * QEMU Xen emulation: Shared/overlay pages support + * + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" + +#include "qemu/host-utils.h" +#include "qemu/module.h" +#include "qemu/main-loop.h" +#include "qemu/cutils.h" +#include "qapi/error.h" +#include "qom/object.h" +#include "migration/vmstate.h" + +#include "hw/sysbus.h" +#include "hw/xen/xen.h" +#include "xen_overlay.h" +#include "xen_evtchn.h" +#include "xen_xenstore.h" + +#include "sysemu/kvm.h" +#include "sysemu/kvm_xen.h" + +#include "hw/xen/interface/io/xs_wire.h" +#include "hw/xen/interface/event_channel.h" + +#define TYPE_XEN_XENSTORE "xen-xenstore" +OBJECT_DECLARE_SIMPLE_TYPE(XenXenstoreState, XEN_XENSTORE) + +#define XEN_PAGE_SHIFT 12 +#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT) + +#define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t)) +#define ENTRIES_PER_FRAME_V2 (XEN_PAGE_SIZE / sizeof(grant_entry_v2_t)) + +#define XENSTORE_HEADER_SIZE ((unsigned int)sizeof(struct xsd_sockmsg)) + +struct XenXenstoreState { + /*< private >*/ + SysBusDevice busdev; + /*< public >*/ + + MemoryRegion xenstore_page; + struct xenstore_domain_interface *xs; + uint8_t req_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX]; + uint8_t rsp_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX]; + uint32_t req_offset; + uint32_t rsp_offset; + bool rsp_pending; + bool fatal_error; + + evtchn_port_t guest_port; + evtchn_port_t be_port; + struct xenevtchn_handle *eh; +}; + +struct XenXenstoreState *xen_xenstore_singleton; + +static void xen_xenstore_event(void *opaque); + +static void xen_xenstore_realize(DeviceState *dev, Error **errp) +{ + XenXenstoreState *s = XEN_XENSTORE(dev); + + if (xen_mode != XEN_EMULATE) { + error_setg(errp, "Xen xenstore support is for Xen emulation"); + return; + } + memory_region_init_ram(&s->xenstore_page, OBJECT(dev), "xen:xenstore_page", + XEN_PAGE_SIZE, &error_abort); + memory_region_set_enabled(&s->xenstore_page, true); + s->xs = memory_region_get_ram_ptr(&s->xenstore_page); + memset(s->xs, 0, XEN_PAGE_SIZE); + + /* We can't map it this early as KVM isn't ready */ + xen_xenstore_singleton = s; + + s->eh = xen_be_evtchn_open(); + if (!s->eh) { + error_setg(errp, "Xenstore evtchn port init failed"); + return; + } + aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), true, + xen_xenstore_event, NULL, NULL, NULL, s); +} + +static bool xen_xenstore_is_needed(void *opaque) +{ + return xen_mode == XEN_EMULATE; +} + +static int 
xen_xenstore_pre_save(void *opaque) +{ + XenXenstoreState *s = opaque; + + if (s->eh) { + s->guest_port = xen_be_evtchn_get_guest_port(s->eh); + } + return 0; +} + +static int xen_xenstore_post_load(void *opaque, int ver) +{ + XenXenstoreState *s = opaque; + + /* + * As qemu/dom0, rebind to the guest's port. The Windows drivers may + * unbind the XenStore evtchn and rebind to it, having obtained the + * "remote" port through EVTCHNOP_status. In the case that migration + * occurs while it's unbound, the "remote" port needs to be the same + * as before so that the guest can find it, but should remain unbound. + */ + if (s->guest_port) { + int be_port = xen_be_evtchn_bind_interdomain(s->eh, xen_domid, + s->guest_port); + if (be_port < 0) { + return be_port; + } + s->be_port = be_port; + } + return 0; +} + +static const VMStateDescription xen_xenstore_vmstate = { + .name = "xen_xenstore", + .version_id = 1, + .minimum_version_id = 1, + .needed = xen_xenstore_is_needed, + .pre_save = xen_xenstore_pre_save, + .post_load = xen_xenstore_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT8_ARRAY(req_data, XenXenstoreState, + sizeof_field(XenXenstoreState, req_data)), + VMSTATE_UINT8_ARRAY(rsp_data, XenXenstoreState, + sizeof_field(XenXenstoreState, rsp_data)), + VMSTATE_UINT32(req_offset, XenXenstoreState), + VMSTATE_UINT32(rsp_offset, XenXenstoreState), + VMSTATE_BOOL(rsp_pending, XenXenstoreState), + VMSTATE_UINT32(guest_port, XenXenstoreState), + VMSTATE_BOOL(fatal_error, XenXenstoreState), + VMSTATE_END_OF_LIST() + } +}; + +static void xen_xenstore_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = xen_xenstore_realize; + dc->vmsd = &xen_xenstore_vmstate; +} + +static const TypeInfo xen_xenstore_info = { + .name = TYPE_XEN_XENSTORE, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(XenXenstoreState), + .class_init = xen_xenstore_class_init, +}; + +void xen_xenstore_create(void) +{ + DeviceState *dev = sysbus_create_simple(TYPE_XEN_XENSTORE, -1, NULL); + + xen_xenstore_singleton = XEN_XENSTORE(dev); + + /* + * Defer the init (xen_xenstore_reset()) until KVM is set up and the + * overlay page can be mapped. 
+ */ +} + +static void xen_xenstore_register_types(void) +{ + type_register_static(&xen_xenstore_info); +} + +type_init(xen_xenstore_register_types) + +uint16_t xen_xenstore_get_port(void) +{ + XenXenstoreState *s = xen_xenstore_singleton; + if (!s) { + return 0; + } + return s->guest_port; +} + +static bool req_pending(XenXenstoreState *s) +{ + struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data; + + return s->req_offset == XENSTORE_HEADER_SIZE + req->len; +} + +static void reset_req(XenXenstoreState *s) +{ + memset(s->req_data, 0, sizeof(s->req_data)); + s->req_offset = 0; +} + +static void reset_rsp(XenXenstoreState *s) +{ + s->rsp_pending = false; + + memset(s->rsp_data, 0, sizeof(s->rsp_data)); + s->rsp_offset = 0; +} + +static void process_req(XenXenstoreState *s) +{ + struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data; + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + const char enosys[] = "ENOSYS"; + + assert(req_pending(s)); + assert(!s->rsp_pending); + + rsp->type = XS_ERROR; + rsp->req_id = req->req_id; + rsp->tx_id = req->tx_id; + rsp->len = sizeof(enosys); + memcpy((void *)&rsp[1], enosys, sizeof(enosys)); + + s->rsp_pending = true; + reset_req(s); +} + +static unsigned int copy_from_ring(XenXenstoreState *s, uint8_t *ptr, + unsigned int len) +{ + if (!len) { + return 0; + } + + XENSTORE_RING_IDX prod = qatomic_read(&s->xs->req_prod); + XENSTORE_RING_IDX cons = qatomic_read(&s->xs->req_cons); + unsigned int copied = 0; + + /* Ensure the ring contents don't cross the req_prod access. */ + smp_rmb(); + + while (len) { + unsigned int avail = prod - cons; + unsigned int offset = MASK_XENSTORE_IDX(cons); + unsigned int copylen = avail; + + if (avail > XENSTORE_RING_SIZE) { + error_report("XenStore ring handling error"); + s->fatal_error = true; + break; + } else if (avail == 0) { + break; + } + + if (copylen > len) { + copylen = len; + } + if (copylen > XENSTORE_RING_SIZE - offset) { + copylen = XENSTORE_RING_SIZE - offset; + } + + memcpy(ptr, &s->xs->req[offset], copylen); + copied += copylen; + + ptr += copylen; + len -= copylen; + + cons += copylen; + } + + /* + * Not sure this ever mattered except on Alpha, but this barrier + * is to ensure that the update to req_cons is globally visible + * only after we have consumed all the data from the ring, and we + * don't end up seeing data written to the ring *after* the other + * end sees the update and writes more to the ring. Xen's own + * xenstored has the same barrier here (although with no comment + * at all, obviously, because it's Xen code). + */ + smp_mb(); + + qatomic_set(&s->xs->req_cons, cons); + + return copied; +} + +static unsigned int copy_to_ring(XenXenstoreState *s, uint8_t *ptr, + unsigned int len) +{ + if (!len) { + return 0; + } + + XENSTORE_RING_IDX cons = qatomic_read(&s->xs->rsp_cons); + XENSTORE_RING_IDX prod = qatomic_read(&s->xs->rsp_prod); + unsigned int copied = 0; + + /* + * This matches the barrier in copy_to_ring() (or the guest's + * equivalent) between writing the data to the ring and updating + * rsp_prod. It protects against the pathological case (which + * again I think never happened except on Alpha) where our + * subsequent writes to the ring could *cross* the read of + * rsp_cons and the guest could see the new data when it was + * intending to read the old.
+ */ + smp_mb(); + + while (len) { + unsigned int avail = cons + XENSTORE_RING_SIZE - prod; + unsigned int offset = MASK_XENSTORE_IDX(prod); + unsigned int copylen = len; + + if (avail > XENSTORE_RING_SIZE) { + error_report("XenStore ring handling error"); + s->fatal_error = true; + break; + } else if (avail == 0) { + break; + } + + if (copylen > avail) { + copylen = avail; + } + if (copylen > XENSTORE_RING_SIZE - offset) { + copylen = XENSTORE_RING_SIZE - offset; + } + + + memcpy(&s->xs->rsp[offset], ptr, copylen); + copied += copylen; + + ptr += copylen; + len -= copylen; + + prod += copylen; + } + + /* Ensure the ring contents are seen before rsp_prod update. */ + smp_wmb(); + + qatomic_set(&s->xs->rsp_prod, prod); + + return copied; +} + +static unsigned int get_req(XenXenstoreState *s) +{ + unsigned int copied = 0; + + if (s->fatal_error) { + return 0; + } + + assert(!req_pending(s)); + + if (s->req_offset < XENSTORE_HEADER_SIZE) { + void *ptr = s->req_data + s->req_offset; + unsigned int len = XENSTORE_HEADER_SIZE; + unsigned int copylen = copy_from_ring(s, ptr, len); + + copied += copylen; + s->req_offset += copylen; + } + + if (s->req_offset >= XENSTORE_HEADER_SIZE) { + struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data; + + if (req->len > (uint32_t)XENSTORE_PAYLOAD_MAX) { + error_report("Illegal XenStore request"); + s->fatal_error = true; + return 0; + } + + void *ptr = s->req_data + s->req_offset; + unsigned int len = XENSTORE_HEADER_SIZE + req->len - s->req_offset; + unsigned int copylen = copy_from_ring(s, ptr, len); + + copied += copylen; + s->req_offset += copylen; + } + + return copied; +} + +static unsigned int put_rsp(XenXenstoreState *s) +{ + if (s->fatal_error) { + return 0; + } + + assert(s->rsp_pending); + + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + assert(s->rsp_offset < XENSTORE_HEADER_SIZE + rsp->len); + + void *ptr = s->rsp_data + s->rsp_offset; + unsigned int len = XENSTORE_HEADER_SIZE + rsp->len - s->rsp_offset; + unsigned int copylen = copy_to_ring(s, ptr, len); + + s->rsp_offset += copylen; + + /* Have we produced a complete response? */ + if (s->rsp_offset == XENSTORE_HEADER_SIZE + rsp->len) { + reset_rsp(s); + } + + return copylen; +} + +static void xen_xenstore_event(void *opaque) +{ + XenXenstoreState *s = opaque; + evtchn_port_t port = xen_be_evtchn_pending(s->eh); + unsigned int copied_to, copied_from; + bool processed, notify = false; + + if (port != s->be_port) { + return; + } + + /* We know this is a no-op. 
*/ + xen_be_evtchn_unmask(s->eh, port); + + do { + copied_to = copied_from = 0; + processed = false; + + if (s->rsp_pending) { + copied_to = put_rsp(s); + } + + if (!req_pending(s)) { + copied_from = get_req(s); + } + + if (req_pending(s) && !s->rsp_pending) { + process_req(s); + processed = true; + } + + notify |= copied_to || copied_from; + } while (copied_to || copied_from || processed); + + if (notify) { + xen_be_evtchn_notify(s->eh, s->be_port); + } +} + +static void alloc_guest_port(XenXenstoreState *s) +{ + struct evtchn_alloc_unbound alloc = { + .dom = DOMID_SELF, + .remote_dom = DOMID_QEMU, + }; + + if (!xen_evtchn_alloc_unbound_op(&alloc)) { + s->guest_port = alloc.port; + } +} + +int xen_xenstore_reset(void) +{ + XenXenstoreState *s = xen_xenstore_singleton; + int err; + + if (!s) { + return -ENOTSUP; + } + + s->req_offset = s->rsp_offset = 0; + s->rsp_pending = false; + + if (!memory_region_is_mapped(&s->xenstore_page)) { + uint64_t gpa = XEN_SPECIAL_PFN(XENSTORE) << TARGET_PAGE_BITS; + xen_overlay_do_map_page(&s->xenstore_page, gpa); + } + + alloc_guest_port(s); + + /* + * As qemu/dom0, bind to the guest's port. For incoming migration, this + * will be unbound as the guest's evtchn table is overwritten. We then + * rebind to the correct guest port in xen_xenstore_post_load(). + */ + err = xen_be_evtchn_bind_interdomain(s->eh, xen_domid, s->guest_port); + if (err < 0) { + return err; + } + s->be_port = err; + + return 0; +} diff --git a/hw/i386/kvm/xen_xenstore.h b/hw/i386/kvm/xen_xenstore.h new file mode 100644 index 0000000..8c3768e --- /dev/null +++ b/hw/i386/kvm/xen_xenstore.h @@ -0,0 +1,20 @@ +/* + * QEMU Xen emulation: Xenstore emulation + * + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef QEMU_XEN_XENSTORE_H +#define QEMU_XEN_XENSTORE_H + +void xen_xenstore_create(void); +int xen_xenstore_reset(void); + +uint16_t xen_xenstore_get_port(void); + +#endif /* QEMU_XEN_XENSTORE_H */ diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 992951c..fd17ce7 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -90,6 +90,10 @@ #include "hw/virtio/virtio-iommu.h" #include "hw/virtio/virtio-pmem-pci.h" #include "hw/virtio/virtio-mem-pci.h" +#include "hw/i386/kvm/xen_overlay.h" +#include "hw/i386/kvm/xen_evtchn.h" +#include "hw/i386/kvm/xen_gnttab.h" +#include "hw/i386/kvm/xen_xenstore.h" #include "hw/mem/memory-device.h" #include "sysemu/replay.h" #include "target/i386/cpu.h" @@ -1308,6 +1312,15 @@ void pc_basic_device_init(struct PCMachineState *pcms, } *rtc_state = ISA_DEVICE(mc146818_rtc_init(isa_bus, 2000, rtc_irq)); +#ifdef CONFIG_XEN_EMU + if (xen_mode == XEN_EMULATE) { + xen_evtchn_connect_gsis(gsi); + if (pcms->bus) { + pci_create_simple(pcms->bus, -1, "xen-platform"); + } + } +#endif + qemu_register_boot_set(pc_boot_set, *rtc_state); if (!xen_enabled() && @@ -1846,6 +1859,19 @@ static void pc_machine_initfn(Object *obj) cxl_machine_init(obj, &pcms->cxl_devices_state); } +int pc_machine_kvm_type(MachineState *machine, const char *kvm_type) +{ +#ifdef CONFIG_XEN_EMU + if (xen_mode == XEN_EMULATE) { + xen_overlay_create(); + xen_evtchn_create(); + xen_gnttab_create(); + xen_xenstore_create(); + } +#endif + return 0; +} + static void pc_machine_reset(MachineState *machine, ShutdownCause reason) { CPUState *cs; diff --git a/hw/i386/x86.c b/hw/i386/x86.c index c44846f..a56b10b 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -61,6 +61,11 @@ #include CONFIG_DEVICES #include "kvm/kvm_i386.h" +#ifdef CONFIG_XEN_EMU +#include "hw/xen/xen.h" +#include "hw/i386/kvm/xen_evtchn.h" +#endif + /* Physical Address of PVH entry point read from kernel ELF NOTE */ static size_t pvh_start_addr; @@ -610,6 +615,17 @@ void gsi_handler(void *opaque, int n, int level) } /* fall through */ case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1: +#ifdef CONFIG_XEN_EMU + /* + * Xen delivers the GSI to the Legacy PIC (not that Legacy PIC + * routing actually works properly under Xen). And then to + * *either* the PIRQ handling or the I/OAPIC depending on + * whether the former wants it. 
+ */ + if (xen_mode == XEN_EMULATE && xen_evtchn_set_gsi(n, level)) { + break; + } +#endif qemu_set_irq(s->ioapic_irq[n], level); break; case IO_APIC_SECONDARY_IRQBASE diff --git a/hw/i386/xen/meson.build b/hw/i386/xen/meson.build index be84130..2e64a34 100644 --- a/hw/i386/xen/meson.build +++ b/hw/i386/xen/meson.build @@ -2,6 +2,9 @@ i386_ss.add(when: 'CONFIG_XEN', if_true: files( 'xen-hvm.c', 'xen-mapcache.c', 'xen_apic.c', - 'xen_platform.c', 'xen_pvdevice.c', )) + +i386_ss.add(when: 'CONFIG_XEN_BUS', if_true: files( + 'xen_platform.c', +)) diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c index b9a6f7f..e5a1dd1 100644 --- a/hw/i386/xen/xen-hvm.c +++ b/hw/i386/xen/xen-hvm.c @@ -1502,13 +1502,7 @@ void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory) device_listener_register(&state->device_listener); xen_bus_init(); - - /* Initialize backend core & drivers */ - if (xen_be_init() != 0) { - error_report("xen backend core setup failed"); - goto err; - } - xen_be_register_common(); + xen_be_init(); QLIST_INIT(&xen_physmap); xen_read_physmap(state); diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c index 3795a20..539f7da 100644 --- a/hw/i386/xen/xen_platform.c +++ b/hw/i386/xen/xen_platform.c @@ -27,9 +27,9 @@ #include "qapi/error.h" #include "hw/ide/pci.h" #include "hw/pci/pci.h" -#include "hw/xen/xen_common.h" #include "migration/vmstate.h" -#include "hw/xen/xen-legacy-backend.h" +#include "hw/xen/xen.h" +#include "net/net.h" #include "trace.h" #include "sysemu/xen.h" #include "sysemu/block-backend.h" @@ -37,6 +37,11 @@ #include "qemu/module.h" #include "qom/object.h" +#ifdef CONFIG_XEN +#include "hw/xen/xen_common.h" +#include "hw/xen/xen-legacy-backend.h" +#endif + //#define DEBUG_PLATFORM #ifdef DEBUG_PLATFORM @@ -108,12 +113,25 @@ static void log_writeb(PCIXenPlatformState *s, char val) #define _UNPLUG_NVME_DISKS 3 #define UNPLUG_NVME_DISKS (1u << _UNPLUG_NVME_DISKS) +static bool pci_device_is_passthrough(PCIDevice *d) +{ + if (!strcmp(d->name, "xen-pci-passthrough")) { + return true; + } + + if (xen_mode == XEN_EMULATE && !strcmp(d->name, "vfio-pci")) { + return true; + } + + return false; +} + static void unplug_nic(PCIBus *b, PCIDevice *d, void *o) { /* We have to ignore passthrough devices */ if (pci_get_word(d->config + PCI_CLASS_DEVICE) == PCI_CLASS_NETWORK_ETHERNET - && strcmp(d->name, "xen-pci-passthrough") != 0) { + && !pci_device_is_passthrough(d)) { object_unparent(OBJECT(d)); } } @@ -186,9 +204,8 @@ static void unplug_disks(PCIBus *b, PCIDevice *d, void *opaque) !(flags & UNPLUG_IDE_SCSI_DISKS); /* We have to ignore passthrough devices */ - if (!strcmp(d->name, "xen-pci-passthrough")) { + if (pci_device_is_passthrough(d)) return; - } switch (pci_get_word(d->config + PCI_CLASS_DEVICE)) { case PCI_CLASS_STORAGE_IDE: @@ -267,18 +284,26 @@ static void platform_fixed_ioport_writeb(void *opaque, uint32_t addr, uint32_t v PCIXenPlatformState *s = opaque; switch (addr) { - case 0: /* Platform flags */ { - hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ? - HVMMEM_ram_ro : HVMMEM_ram_rw; - if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) { - DPRINTF("unable to change ro/rw state of ROM memory area!\n"); - } else { + case 0: /* Platform flags */ + if (xen_mode == XEN_EMULATE) { + /* XX: Use i440gx/q35 PAM setup to do this? */ s->flags = val & PFFLAG_ROM_LOCK; - DPRINTF("changed ro/rw state of ROM memory area. now is %s state.\n", - (mem_type == HVMMEM_ram_ro ? 
"ro":"rw")); +#ifdef CONFIG_XEN + } else { + hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ? + HVMMEM_ram_ro : HVMMEM_ram_rw; + + if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) { + DPRINTF("unable to change ro/rw state of ROM memory area!\n"); + } else { + s->flags = val & PFFLAG_ROM_LOCK; + DPRINTF("changed ro/rw state of ROM memory area. now is %s state.\n", + (mem_type == HVMMEM_ram_ro ? "ro" : "rw")); + } +#endif } break; - } + case 2: log_writeb(s, val); break; @@ -496,8 +521,8 @@ static void xen_platform_realize(PCIDevice *dev, Error **errp) uint8_t *pci_conf; /* Device will crash on reset if xen is not initialized */ - if (!xen_enabled()) { - error_setg(errp, "xen-platform device requires the Xen accelerator"); + if (xen_mode == XEN_DISABLED) { + error_setg(errp, "xen-platform device requires a Xen guest"); return; } diff --git a/hw/pci/msi.c b/hw/pci/msi.c index 1cadf15..041b0bd 100644 --- a/hw/pci/msi.c +++ b/hw/pci/msi.c @@ -24,6 +24,8 @@ #include "qemu/range.h" #include "qapi/error.h" +#include "hw/i386/kvm/xen_evtchn.h" + /* PCI_MSI_ADDRESS_LO */ #define PCI_MSI_ADDRESS_LO_MASK (~0x3) @@ -414,6 +416,15 @@ void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len) fprintf(stderr, "\n"); #endif + if (xen_mode == XEN_EMULATE) { + for (vector = 0; vector < msi_nr_vectors(flags); vector++) { + MSIMessage msg = msi_prepare_message(dev, vector); + + xen_evtchn_snoop_msi(dev, false, vector, msg.address, msg.data, + msi_is_masked(dev, vector)); + } + } + if (!(flags & PCI_MSI_FLAGS_ENABLE)) { return; } diff --git a/hw/pci/msix.c b/hw/pci/msix.c index 9e70fcd..ab8869d 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -26,6 +26,8 @@ #include "qapi/error.h" #include "trace.h" +#include "hw/i386/kvm/xen_evtchn.h" + /* MSI enable bit and maskall bit are in byte 1 in FLAGS register */ #define MSIX_CONTROL_OFFSET (PCI_MSIX_FLAGS + 1) #define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8) @@ -124,6 +126,13 @@ static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked) { bool is_masked = msix_is_masked(dev, vector); + if (xen_mode == XEN_EMULATE) { + MSIMessage msg = msix_prepare_message(dev, vector); + + xen_evtchn_snoop_msi(dev, true, vector, msg.address, msg.data, + is_masked); + } + if (is_masked == was_masked) { return; } diff --git a/hw/pci/pci.c b/hw/pci/pci.c index bad8e63..10c980b 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -49,6 +49,9 @@ #include "qemu/cutils.h" #include "pci-internal.h" +#include "hw/xen/xen.h" +#include "hw/i386/kvm/xen_evtchn.h" + //#define DEBUG_PCI #ifdef DEBUG_PCI # define PCI_DPRINTF(format, ...) printf(format, ## __VA_ARGS__) @@ -319,6 +322,17 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg) { MemTxAttrs attrs = {}; + /* + * Xen uses the high bits of the address to contain some of the bits + * of the PIRQ#. Therefore we can't just send the write cycle and + * trust that it's caught by the APIC at 0xfee00000 because the + * target of the write might be e.g. 0x0x1000fee46000 for PIRQ#4166. + * So we intercept the delivery here instead of in kvm_send_msi(). 
+ */ + if (xen_mode == XEN_EMULATE && + xen_evtchn_deliver_pirq_msi(msg.address, msg.data)) { + return; + } attrs.requester_id = pci_requester_id(dev); address_space_stl_le(&dev->bus_master_as, msg.address, msg.data, attrs, NULL); @@ -988,6 +1002,9 @@ static void do_pci_unregister_device(PCIDevice *pci_dev) pci_get_bus(pci_dev)->devices[pci_dev->devfn] = NULL; pci_config_free(pci_dev); + if (xen_mode == XEN_EMULATE) { + xen_evtchn_remove_pci_device(pci_dev); + } if (memory_region_is_mapped(&pci_dev->bus_master_enable_region)) { memory_region_del_subregion(&pci_dev->bus_master_container_region, &pci_dev->bus_master_enable_region); diff --git a/hw/xen/Kconfig b/hw/xen/Kconfig new file mode 100644 index 0000000..3467efb --- /dev/null +++ b/hw/xen/Kconfig @@ -0,0 +1,3 @@ +config XEN_BUS + bool + default y if (XEN || XEN_EMU) diff --git a/hw/xen/xen-legacy-backend.c b/hw/xen/xen-legacy-backend.c index 085fd31..afba71f 100644 --- a/hw/xen/xen-legacy-backend.c +++ b/hw/xen/xen-legacy-backend.c @@ -676,21 +676,30 @@ void xenstore_update_fe(char *watch, struct XenLegacyDevice *xendev) } /* -------------------------------------------------------------------- */ -int xen_be_init(void) +static void xen_set_dynamic_sysbus(void) +{ + Object *machine = qdev_get_machine(); + ObjectClass *oc = object_get_class(machine); + MachineClass *mc = MACHINE_CLASS(oc); + + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_XENSYSDEV); +} + +void xen_be_init(void) { xengnttab_handle *gnttabdev; xenstore = xs_daemon_open(); if (!xenstore) { xen_pv_printf(NULL, 0, "can't connect to xenstored\n"); - return -1; + exit(1); } qemu_set_fd_handler(xs_fileno(xenstore), xenstore_update, NULL, NULL); if (xen_xc == NULL || xen_fmem == NULL) { - /* Check if xen_init() have been called */ - goto err; + xen_pv_printf(NULL, 0, "Xen operations not set up\n"); + exit(1); } gnttabdev = xengnttab_open(NULL, 0); @@ -706,23 +715,16 @@ int xen_be_init(void) xen_sysbus = qbus_new(TYPE_XENSYSBUS, xen_sysdev, "xen-sysbus"); qbus_set_bus_hotplug_handler(xen_sysbus); - return 0; - -err: - qemu_set_fd_handler(xs_fileno(xenstore), NULL, NULL, NULL); - xs_daemon_close(xenstore); - xenstore = NULL; - - return -1; -} - -static void xen_set_dynamic_sysbus(void) -{ - Object *machine = qdev_get_machine(); - ObjectClass *oc = object_get_class(machine); - MachineClass *mc = MACHINE_CLASS(oc); + xen_set_dynamic_sysbus(); - machine_class_allow_dynamic_sysbus_dev(mc, TYPE_XENSYSDEV); + xen_be_register("console", &xen_console_ops); + xen_be_register("vkbd", &xen_kbdmouse_ops); +#ifdef CONFIG_VIRTFS + xen_be_register("9pfs", &xen_9pfs_ops); +#endif +#ifdef CONFIG_USB_LIBUSB + xen_be_register("qusb", &xen_usb_ops); +#endif } int xen_be_register(const char *type, struct XenDevOps *ops) @@ -744,20 +746,6 @@ int xen_be_register(const char *type, struct XenDevOps *ops) return xenstore_scan(type, xen_domid, ops); } -void xen_be_register_common(void) -{ - xen_set_dynamic_sysbus(); - - xen_be_register("console", &xen_console_ops); - xen_be_register("vkbd", &xen_kbdmouse_ops); -#ifdef CONFIG_VIRTFS - xen_be_register("9pfs", &xen_9pfs_ops); -#endif -#ifdef CONFIG_USB_LIBUSB - xen_be_register("qusb", &xen_usb_ops); -#endif -} - int xen_be_bind_evtchn(struct XenLegacyDevice *xendev) { if (xendev->local_port != -1) { diff --git a/hw/xenpv/xen_machine_pv.c b/hw/xenpv/xen_machine_pv.c index 20c9611..2e759d0 100644 --- a/hw/xenpv/xen_machine_pv.c +++ b/hw/xenpv/xen_machine_pv.c @@ -36,10 +36,7 @@ static void xen_init_pv(MachineState *machine) int i; /* Initialize backend core & 
drivers */ - if (xen_be_init() != 0) { - error_report("%s: xen backend core setup failed", __func__); - exit(1); - } + xen_be_init(); switch (xen_mode) { case XEN_ATTACH: @@ -55,7 +52,6 @@ static void xen_init_pv(MachineState *machine) break; } - xen_be_register_common(); xen_be_register("vfb", &xen_framebuffer_ops); xen_be_register("qnic", &xen_netdev_ops);
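
The xen_be_evtchn_* functions declared in hw/i386/kvm/xen_evtchn.h above deliberately mirror the libxenevtchn API, and xen_xenstore.c is their first in-QEMU user. The sketch below shows how a backend might drive that API end to end; it is a hypothetical illustration, not part of this series, and assumes only the declarations above, xen_domid from hw/xen/xen.h, qemu_set_fd_handler() from the main loop, and a guest port already allocated via EVTCHNOP_alloc_unbound (as xen_xenstore.c does).

/*
 * Hypothetical backend sketch, not part of this series.
 */
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "hw/xen/xen.h"
#include "hw/i386/kvm/xen_evtchn.h"

static struct xenevtchn_handle *eh;

static void example_evtchn_event(void *opaque)
{
    /* Returns our backend port if it fired (and drains the eventfd). */
    evtchn_port_t port = xen_be_evtchn_pending(eh);

    if (!port) {
        return;
    }

    /* A no-op for the QEMU backend; kept only for libxenevtchn parity. */
    xen_be_evtchn_unmask(eh, port);

    /* ... process the shared ring here, then kick the guest ... */
    xen_be_evtchn_notify(eh, port);
}

static int example_evtchn_setup(uint32_t guest_port)
{
    int be_port;

    eh = xen_be_evtchn_open();
    if (!eh) {
        return -1;
    }

    /* Bind to the port the guest allocated for us in our own domain. */
    be_port = xen_be_evtchn_bind_interdomain(eh, xen_domid, guest_port);
    if (be_port < 0) {
        xen_be_evtchn_close(eh);
        return be_port;
    }

    /* The handle exposes an eventfd that the main loop can poll. */
    qemu_set_fd_handler(xen_be_evtchn_fd(eh), example_evtchn_event, NULL, NULL);
    return 0;
}

As the bind path in xen_be_evtchn_bind_interdomain() above shows, when KVM advertises the EVTCHN_SEND capability the guest port is also handed to the kernel via assign_kernel_eventfd(), so a guest EVTCHNOP_send can be accelerated in the kernel and still wake the backend through the same eventfd.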
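
The req/rsp ring handling in xen_xenstore.c above relies on two barriers: a read barrier after loading the producer index and before reading the data, and a full barrier after consuming the data and before publishing the new consumer index. Below is a standalone sketch of the same discipline, with C11 atomics standing in for QEMU's qatomic_*/smp_* helpers; the type names and ring size are illustrative and are not the xs_wire.h layout.

/* Simplified consumer, equivalent in spirit to copy_from_ring() above. */
#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

#define RING_SIZE 1024u                      /* must be a power of two */
#define MASK(idx) ((idx) & (RING_SIZE - 1))  /* indices are free-running */

struct ring {
    _Atomic uint32_t prod;   /* written only by the producer */
    _Atomic uint32_t cons;   /* written only by the consumer */
    uint8_t data[RING_SIZE];
};

static unsigned int ring_read(struct ring *r, uint8_t *out, unsigned int len)
{
    /* Acquire on prod orders the data reads after the index read,
     * standing in for the smp_rmb() after reading req_prod. */
    uint32_t prod = atomic_load_explicit(&r->prod, memory_order_acquire);
    uint32_t cons = atomic_load_explicit(&r->cons, memory_order_relaxed);
    unsigned int copied = 0;

    while (len && prod != cons) {
        unsigned int avail = prod - cons;
        unsigned int offset = MASK(cons);
        unsigned int chunk = avail < len ? avail : len;

        if (chunk > RING_SIZE - offset) {
            chunk = RING_SIZE - offset;   /* don't wrap within one memcpy */
        }
        memcpy(out, &r->data[offset], chunk);
        out += chunk;
        len -= chunk;
        cons += chunk;
        copied += chunk;
    }

    /* Publish cons only after the data has been consumed, standing in
     * for the smp_mb() before qatomic_set(&s->xs->req_cons, cons). */
    atomic_store_explicit(&r->cons, cons, memory_order_release);
    return copied;
}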
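
The grant-table device above comes up with no frames mapped and only version 1 of the grant table implemented, and the new GNTTABOP handlers report exactly that back to the guest. Below is a hypothetical smoke test, not part of the patch, which exercises those handlers directly; it assumes xen_gnttab_create() has already run and uses the guest-visible structs from hw/xen/interface/grant_table.h.

/* Hypothetical smoke test for the new grant-table ops (not in the series). */
#include "qemu/osdep.h"
#include "hw/xen/xen.h"
#include "hw/xen/interface/grant_table.h"
#include "hw/i386/kvm/xen_gnttab.h"
#include "sysemu/kvm_xen.h"

static void check_gnttab_defaults(void)
{
    struct gnttab_query_size size = { .dom = DOMID_SELF };
    struct gnttab_get_version get = { .dom = DOMID_SELF };
    struct gnttab_set_version set = { .version = 2 };

    /* No frames are mapped until the guest asks for XENMAPSPACE_grant_table. */
    g_assert(xen_gnttab_query_size_op(&size) == 0);
    g_assert(size.status == GNTST_okay);
    g_assert(size.nr_frames == 0);
    g_assert(size.max_nr_frames == kvm_xen_get_gnttab_max_frames());

    /* Only grant table v1 is implemented; v2 is refused with -ENOSYS. */
    g_assert(xen_gnttab_get_version_op(&get) == 0 && get.version == 1);
    g_assert(xen_gnttab_set_version_op(&set) == -ENOSYS && set.version == 1);
}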