From 820c1aba519bd072ac71c754733f6c86d8b4309f Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Tue, 6 Dec 2022 09:03:48 +0000
Subject: xen: add CONFIG_XEN_BUS and CONFIG_XEN_EMU options for Xen emulation

The XEN_EMU option will cover core Xen support in target/, which exists
only for x86 with KVM today but could theoretically also be implemented
on Arm/Aarch64 and with TCG or other accelerators (if anyone wants to
run the gauntlet of struct layout compatibility, errno mapping, and the
rest of that fui).

It will also cover the support for architecture-independent grant table
and event channel support which will be added in hw/i386/kvm/ (on the
basis that the non-KVM support is very theoretical and making it not use
KVM directly seems like gratuitous overengineering at this point).

The XEN_BUS option is for the xenfv platform support, which will now be
used both by XEN_EMU and by real Xen.

The XEN option remains dependent on the Xen runtime libraries, and covers
support for real Xen. Some code which currently resides under CONFIG_XEN
will be moving to CONFIG_XEN_BUS over time as the direct dependencies on
Xen runtime libraries are eliminated. The Xen PCI platform device will
also reside under CONFIG_XEN_BUS.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/Kconfig      | 1 +
 hw/i386/Kconfig | 5 +++++
 hw/xen/Kconfig  | 3 +++
 3 files changed, 9 insertions(+)
 create mode 100644 hw/xen/Kconfig

(limited to 'hw')

diff --git a/hw/Kconfig b/hw/Kconfig
index 38233bb..ba62ff6 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -41,6 +41,7 @@ source tpm/Kconfig
 source usb/Kconfig
 source virtio/Kconfig
 source vfio/Kconfig
+source xen/Kconfig
 source watchdog/Kconfig
 
 # arch Kconfig
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index 9fbfe74..d40802d 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -136,3 +136,8 @@ config VMPORT
 config VMMOUSE
     bool
     depends on VMPORT
+
+config XEN_EMU
+    bool
+    default y
+    depends on KVM && (I386 || X86_64)
diff --git a/hw/xen/Kconfig b/hw/xen/Kconfig
new file mode 100644
index 0000000..3467efb
--- /dev/null
+++ b/hw/xen/Kconfig
@@ -0,0 +1,3 @@
+config XEN_BUS
+    bool
+    default y if (XEN || XEN_EMU)
-- 
cgit v1.1


From 3bb1ebac6cc372d2eaa12a5e576b2d28f3a29269 Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Fri, 18 Jan 2019 14:29:52 -0500
Subject: xen-platform: exclude vfio-pci from the PCI platform unplug

Such that PCI passthrough devices work for Xen emulated guests.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/xen/xen_platform.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c
index 3795a20..8a61697 100644
--- a/hw/i386/xen/xen_platform.c
+++ b/hw/i386/xen/xen_platform.c
@@ -108,12 +108,25 @@ static void log_writeb(PCIXenPlatformState *s, char val)
 #define _UNPLUG_NVME_DISKS 3
 #define UNPLUG_NVME_DISKS (1u << _UNPLUG_NVME_DISKS)
 
+static bool pci_device_is_passthrough(PCIDevice *d)
+{
+    if (!strcmp(d->name, "xen-pci-passthrough")) {
+        return true;
+    }
+
+    if (xen_mode == XEN_EMULATE && !strcmp(d->name, "vfio-pci")) {
+        return true;
+    }
+
+    return false;
+}
+
 static void unplug_nic(PCIBus *b, PCIDevice *d, void *o)
 {
     /* We have to ignore passthrough devices */
     if (pci_get_word(d->config + PCI_CLASS_DEVICE) ==
             PCI_CLASS_NETWORK_ETHERNET
-            && strcmp(d->name, "xen-pci-passthrough") != 0) {
+            && !pci_device_is_passthrough(d)) {
         object_unparent(OBJECT(d));
     }
 }
@@ -186,9 +199,8 @@ static void unplug_disks(PCIBus *b, PCIDevice *d, void *opaque)
         !(flags & UNPLUG_IDE_SCSI_DISKS);
 
     /* We have to ignore passthrough devices */
-    if (!strcmp(d->name, "xen-pci-passthrough")) {
+    if (pci_device_is_passthrough(d))
         return;
-    }
 
     switch (pci_get_word(d->config + PCI_CLASS_DEVICE)) {
     case PCI_CLASS_STORAGE_IDE:
-- 
cgit v1.1


From bb346fae3827b0d7d241e0304069e18cd8155d05 Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Tue, 19 Jun 2018 06:44:46 -0400
Subject: xen-platform: allow its creation with XEN_EMULATE mode

The only thing we need to fix to make this build is the PIO hack which
sets the BIOS memory areas to R/W v.s. R/O. Theoretically we could hook
that up to the PAM registers on the emulated PIIX, but in practice
nobody cares, so just leave it doing nothing.

Now it builds without actual Xen, move it to CONFIG_XEN_BUS to include it
in the KVM-only builds.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/xen/meson.build    |  5 ++++-
 hw/i386/xen/xen_platform.c | 39 ++++++++++++++++++++++++++-------------
 2 files changed, 30 insertions(+), 14 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/xen/meson.build b/hw/i386/xen/meson.build
index be84130..2e64a34 100644
--- a/hw/i386/xen/meson.build
+++ b/hw/i386/xen/meson.build
@@ -2,6 +2,9 @@ i386_ss.add(when: 'CONFIG_XEN', if_true: files(
   'xen-hvm.c',
   'xen-mapcache.c',
   'xen_apic.c',
-  'xen_platform.c',
   'xen_pvdevice.c',
 ))
+
+i386_ss.add(when: 'CONFIG_XEN_BUS', if_true: files(
+  'xen_platform.c',
+))
diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c
index 8a61697..539f7da 100644
--- a/hw/i386/xen/xen_platform.c
+++ b/hw/i386/xen/xen_platform.c
@@ -27,9 +27,9 @@
 #include "qapi/error.h"
 #include "hw/ide/pci.h"
 #include "hw/pci/pci.h"
-#include "hw/xen/xen_common.h"
 #include "migration/vmstate.h"
-#include "hw/xen/xen-legacy-backend.h"
+#include "hw/xen/xen.h"
+#include "net/net.h"
 #include "trace.h"
 #include "sysemu/xen.h"
 #include "sysemu/block-backend.h"
@@ -37,6 +37,11 @@
 #include "qemu/module.h"
 #include "qom/object.h"
 
+#ifdef CONFIG_XEN
+#include "hw/xen/xen_common.h"
+#include "hw/xen/xen-legacy-backend.h"
+#endif
+
 //#define DEBUG_PLATFORM
 
 #ifdef DEBUG_PLATFORM
@@ -279,18 +284,26 @@ static void platform_fixed_ioport_writeb(void *opaque, uint32_t addr, uint32_t v
     PCIXenPlatformState *s = opaque;
 
     switch (addr) {
-    case 0: /* Platform flags */ {
-        hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ?
-            HVMMEM_ram_ro : HVMMEM_ram_rw;
-        if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) {
-            DPRINTF("unable to change ro/rw state of ROM memory area!\n");
-        } else {
+    case 0: /* Platform flags */
+        if (xen_mode == XEN_EMULATE) {
+            /* XX: Use i440gx/q35 PAM setup to do this? */
             s->flags = val & PFFLAG_ROM_LOCK;
-            DPRINTF("changed ro/rw state of ROM memory area. now is %s state.\n",
-                    (mem_type == HVMMEM_ram_ro ? "ro":"rw"));
+#ifdef CONFIG_XEN
+        } else {
+            hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ?
+                HVMMEM_ram_ro : HVMMEM_ram_rw;
+
+            if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) {
+                DPRINTF("unable to change ro/rw state of ROM memory area!\n");
+            } else {
+                s->flags = val & PFFLAG_ROM_LOCK;
+                DPRINTF("changed ro/rw state of ROM memory area. now is %s state.\n",
+                        (mem_type == HVMMEM_ram_ro ? "ro" : "rw"));
+            }
+#endif
         }
         break;
-    }
+
     case 2:
         log_writeb(s, val);
         break;
@@ -508,8 +521,8 @@ static void xen_platform_realize(PCIDevice *dev, Error **errp)
     uint8_t *pci_conf;
 
     /* Device will crash on reset if xen is not initialized */
-    if (!xen_enabled()) {
-        error_setg(errp, "xen-platform device requires the Xen accelerator");
+    if (xen_mode == XEN_DISABLED) {
+        error_setg(errp, "xen-platform device requires a Xen guest");
         return;
     }
 
-- 
cgit v1.1


From d40ddd529004709350202114fc1c2fc99127647d Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Wed, 7 Dec 2022 09:19:31 +0000
Subject: hw/xen: Add xen_overlay device for emulating shared xenheap pages

For the shared info page and for grant tables, Xen shares its own pages
from the "Xen heap" to the guest. The guest requests that a given page
from a certain address space (XENMAPSPACE_shared_info, etc.) be mapped
to a given GPA using the XENMEM_add_to_physmap hypercall.

To support that in qemu when *emulating* Xen, create a memory region
(migratable) and allow it to be mapped as an overlay when requested.

Xen theoretically allows the same page to be mapped multiple times
into the guest, but that's hard to track and reinstate over migration,
so we automatically *unmap* any previous mapping when creating a new
one. This approach has been used in production with.... a non-trivial
number of guests expecting true Xen, without any problems yet being
noticed.

This adds just the shared info page for now. The grant tables will be
a larger region, and will need to be overlaid one page at a time. I
think that means I need to create separate aliases for each page of
the overall grant_frames region, so that they can be mapped individually.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/meson.build   |   1 +
 hw/i386/kvm/xen_overlay.c | 210 ++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_overlay.h |  20 +++++
 3 files changed, 231 insertions(+)
 create mode 100644 hw/i386/kvm/xen_overlay.c
 create mode 100644 hw/i386/kvm/xen_overlay.h

(limited to 'hw')

diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 95467f1..6165cbf 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -4,5 +4,6 @@ i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c'))
 i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c'))
+i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen_overlay.c'))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvm/xen_overlay.c b/hw/i386/kvm/xen_overlay.c
new file mode 100644
index 0000000..a2441e2
--- /dev/null
+++ b/hw/i386/kvm/xen_overlay.c
@@ -0,0 +1,210 @@
+/*
+ * QEMU Xen emulation: Shared/overlay pages support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "exec/target_page.h"
+#include "exec/address-spaces.h"
+#include "migration/vmstate.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "xen_overlay.h"
+
+#include "sysemu/kvm.h"
+#include "sysemu/kvm_xen.h"
+#include <linux/kvm.h>
+
+#include "hw/xen/interface/memory.h"
+
+
+#define TYPE_XEN_OVERLAY "xen-overlay"
+OBJECT_DECLARE_SIMPLE_TYPE(XenOverlayState, XEN_OVERLAY)
+
+#define XEN_PAGE_SHIFT 12
+#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
+
+struct XenOverlayState {
+    /*< private >*/
+    SysBusDevice busdev;
+    /*< public >*/
+
+    MemoryRegion shinfo_mem;
+    void *shinfo_ptr;
+    uint64_t shinfo_gpa;
+};
+
+struct XenOverlayState *xen_overlay_singleton;
+
+static void xen_overlay_do_map_page(MemoryRegion *page, uint64_t gpa)
+{
+    /*
+     * Xen allows guests to map the same page as many times as it likes
+     * into guest physical frames. We don't, because it would be hard
+     * to track and restore them all. One mapping of each page is
+     * perfectly sufficient for all known guests... and we've tested
+     * that theory on a few now in other implementations. dwmw2.
+     */
+    if (memory_region_is_mapped(page)) {
+        if (gpa == INVALID_GPA) {
+            memory_region_del_subregion(get_system_memory(), page);
+        } else {
+            /* Just move it */
+            memory_region_set_address(page, gpa);
+        }
+    } else if (gpa != INVALID_GPA) {
+        memory_region_add_subregion_overlap(get_system_memory(), gpa, page, 0);
+    }
+}
+
+/* KVM is the only existing back end for now. Let's not overengineer it yet. */
+static int xen_overlay_set_be_shinfo(uint64_t gfn)
+{
+    struct kvm_xen_hvm_attr xa = {
+        .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+        .u.shared_info.gfn = gfn,
+    };
+
+    return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+}
+
+
+static void xen_overlay_realize(DeviceState *dev, Error **errp)
+{
+    XenOverlayState *s = XEN_OVERLAY(dev);
+
+    if (xen_mode != XEN_EMULATE) {
+        error_setg(errp, "Xen overlay page support is for Xen emulation");
+        return;
+    }
+
+    memory_region_init_ram(&s->shinfo_mem, OBJECT(dev), "xen:shared_info",
+                           XEN_PAGE_SIZE, &error_abort);
+    memory_region_set_enabled(&s->shinfo_mem, true);
+
+    s->shinfo_ptr = memory_region_get_ram_ptr(&s->shinfo_mem);
+    s->shinfo_gpa = INVALID_GPA;
+    memset(s->shinfo_ptr, 0, XEN_PAGE_SIZE);
+}
+
+static int xen_overlay_post_load(void *opaque, int version_id)
+{
+    XenOverlayState *s = opaque;
+
+    if (s->shinfo_gpa != INVALID_GPA) {
+        xen_overlay_do_map_page(&s->shinfo_mem, s->shinfo_gpa);
+        xen_overlay_set_be_shinfo(s->shinfo_gpa >> XEN_PAGE_SHIFT);
+    }
+
+    return 0;
+}
+
+static bool xen_overlay_is_needed(void *opaque)
+{
+    return xen_mode == XEN_EMULATE;
+}
+
+static const VMStateDescription xen_overlay_vmstate = {
+    .name = "xen_overlay",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = xen_overlay_is_needed,
+    .post_load = xen_overlay_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(shinfo_gpa, XenOverlayState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void xen_overlay_reset(DeviceState *dev)
+{
+    kvm_xen_soft_reset();
+}
+
+static void xen_overlay_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->reset = xen_overlay_reset;
+    dc->realize = xen_overlay_realize;
+    dc->vmsd = &xen_overlay_vmstate;
+}
+
+static const TypeInfo xen_overlay_info = {
+    .name          = TYPE_XEN_OVERLAY,
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(XenOverlayState),
+    .class_init    = xen_overlay_class_init,
+};
+
+void xen_overlay_create(void)
+{
+    xen_overlay_singleton = XEN_OVERLAY(sysbus_create_simple(TYPE_XEN_OVERLAY,
+                                                             -1, NULL));
+
+    /* If xen_domid wasn't explicitly set, at least make sure it isn't zero. */
+    if (xen_domid == DOMID_QEMU) {
+        xen_domid = 1;
+    };
+}
+
+static void xen_overlay_register_types(void)
+{
+    type_register_static(&xen_overlay_info);
+}
+
+type_init(xen_overlay_register_types)
+
+int xen_overlay_map_shinfo_page(uint64_t gpa)
+{
+    XenOverlayState *s = xen_overlay_singleton;
+    int ret;
+
+    if (!s) {
+        return -ENOENT;
+    }
+
+    assert(qemu_mutex_iothread_locked());
+
+    if (s->shinfo_gpa) {
+        /* If removing shinfo page, turn the kernel magic off first */
+        ret = xen_overlay_set_be_shinfo(INVALID_GFN);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    xen_overlay_do_map_page(&s->shinfo_mem, gpa);
+    if (gpa != INVALID_GPA) {
+        ret = xen_overlay_set_be_shinfo(gpa >> XEN_PAGE_SHIFT);
+        if (ret) {
+            return ret;
+        }
+    }
+    s->shinfo_gpa = gpa;
+
+    return 0;
+}
+
+void *xen_overlay_get_shinfo_ptr(void)
+{
+    XenOverlayState *s = xen_overlay_singleton;
+
+    if (!s) {
+        return NULL;
+    }
+
+    return s->shinfo_ptr;
+}
diff --git a/hw/i386/kvm/xen_overlay.h b/hw/i386/kvm/xen_overlay.h
new file mode 100644
index 0000000..00cff05
--- /dev/null
+++ b/hw/i386/kvm/xen_overlay.h
@@ -0,0 +1,20 @@
+/*
+ * QEMU Xen emulation: Shared/overlay pages support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_XEN_OVERLAY_H
+#define QEMU_XEN_OVERLAY_H
+
+void xen_overlay_create(void);
+
+int xen_overlay_map_shinfo_page(uint64_t gpa);
+void *xen_overlay_get_shinfo_ptr(void);
+
+#endif /* QEMU_XEN_OVERLAY_H */
-- 
cgit v1.1


From e21be724eaf5dfdf1ac3595e0c808b775ec262f2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Mon, 12 Dec 2022 23:40:45 +0000
Subject: i386/xen: add pc_machine_kvm_type to initialize XEN_EMULATE mode

The xen_overlay device (and later similar devices for event channels and
grant tables) need to be instantiated. Do this from a kvm_type method on
the PC machine derivatives, since KVM is only way to support Xen emulation
for now.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/pc.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 992951c..a316a01 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -90,6 +90,7 @@
 #include "hw/virtio/virtio-iommu.h"
 #include "hw/virtio/virtio-pmem-pci.h"
 #include "hw/virtio/virtio-mem-pci.h"
+#include "hw/i386/kvm/xen_overlay.h"
 #include "hw/mem/memory-device.h"
 #include "sysemu/replay.h"
 #include "target/i386/cpu.h"
@@ -1846,6 +1847,16 @@ static void pc_machine_initfn(Object *obj)
     cxl_machine_init(obj, &pcms->cxl_devices_state);
 }
 
+int pc_machine_kvm_type(MachineState *machine, const char *kvm_type)
+{
+#ifdef CONFIG_XEN_EMU
+    if (xen_mode == XEN_EMULATE) {
+        xen_overlay_create();
+    }
+#endif
+    return 0;
+}
+
 static void pc_machine_reset(MachineState *machine, ShutdownCause reason)
 {
     CPUState *cs;
-- 
cgit v1.1


From 110a0ea59f263b6e382ee22c70c31c2364d11eb0 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Mon, 12 Dec 2022 14:03:41 +0000
Subject: i386/xen: manage and save/restore Xen guest long_mode setting

Xen will "latch" the guest's 32-bit or 64-bit ("long mode") setting when
the guest writes the MSR to fill in the hypercall page, or when the guest
sets the event channel callback in HVM_PARAM_CALLBACK_IRQ.

KVM handles the former and sets the kernel's long_mode flag accordingly.
The latter will be handled in userspace. Keep them in sync by noticing
when a hypercall is made in a mode that doesn't match qemu's idea of
the guest mode, and resyncing from the kernel. Do that same sync right
before serialization too, in case the guest has set the hypercall page
but hasn't yet made a system call.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_overlay.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_overlay.h |  4 +++
 2 files changed, 66 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_overlay.c b/hw/i386/kvm/xen_overlay.c
index a2441e2..8685d87 100644
--- a/hw/i386/kvm/xen_overlay.c
+++ b/hw/i386/kvm/xen_overlay.c
@@ -44,6 +44,7 @@ struct XenOverlayState {
     MemoryRegion shinfo_mem;
     void *shinfo_ptr;
     uint64_t shinfo_gpa;
+    bool long_mode;
 };
 
 struct XenOverlayState *xen_overlay_singleton;
@@ -96,9 +97,21 @@ static void xen_overlay_realize(DeviceState *dev, Error **errp)
 
     s->shinfo_ptr = memory_region_get_ram_ptr(&s->shinfo_mem);
     s->shinfo_gpa = INVALID_GPA;
+    s->long_mode = false;
     memset(s->shinfo_ptr, 0, XEN_PAGE_SIZE);
 }
 
+static int xen_overlay_pre_save(void *opaque)
+{
+    /*
+     * Fetch the kernel's idea of long_mode to avoid the race condition
+     * where the guest has set the hypercall page up in 64-bit mode but
+     * not yet made a hypercall by the time migration happens, so qemu
+     * hasn't yet noticed.
+     */
+    return xen_sync_long_mode();
+}
+
 static int xen_overlay_post_load(void *opaque, int version_id)
 {
     XenOverlayState *s = opaque;
@@ -107,6 +120,9 @@ static int xen_overlay_post_load(void *opaque, int version_id)
         xen_overlay_do_map_page(&s->shinfo_mem, s->shinfo_gpa);
         xen_overlay_set_be_shinfo(s->shinfo_gpa >> XEN_PAGE_SHIFT);
     }
+    if (s->long_mode) {
+        xen_set_long_mode(true);
+    }
 
     return 0;
 }
@@ -121,9 +137,11 @@ static const VMStateDescription xen_overlay_vmstate = {
     .version_id = 1,
     .minimum_version_id = 1,
     .needed = xen_overlay_is_needed,
+    .pre_save = xen_overlay_pre_save,
     .post_load = xen_overlay_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(shinfo_gpa, XenOverlayState),
+        VMSTATE_BOOL(long_mode, XenOverlayState),
         VMSTATE_END_OF_LIST()
     }
 };
@@ -208,3 +226,47 @@ void *xen_overlay_get_shinfo_ptr(void)
 
     return s->shinfo_ptr;
 }
+
+int xen_sync_long_mode(void)
+{
+    int ret;
+    struct kvm_xen_hvm_attr xa = {
+        .type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+    };
+
+    if (!xen_overlay_singleton) {
+        return -ENOENT;
+    }
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_GET_ATTR, &xa);
+    if (!ret) {
+        xen_overlay_singleton->long_mode = xa.u.long_mode;
+    }
+
+    return ret;
+}
+
+int xen_set_long_mode(bool long_mode)
+{
+    int ret;
+    struct kvm_xen_hvm_attr xa = {
+        .type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+        .u.long_mode = long_mode,
+    };
+
+    if (!xen_overlay_singleton) {
+        return -ENOENT;
+    }
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+    if (!ret) {
+        xen_overlay_singleton->long_mode = xa.u.long_mode;
+    }
+
+    return ret;
+}
+
+bool xen_is_long_mode(void)
+{
+    return xen_overlay_singleton && xen_overlay_singleton->long_mode;
+}
diff --git a/hw/i386/kvm/xen_overlay.h b/hw/i386/kvm/xen_overlay.h
index 00cff05..5c46a0b 100644
--- a/hw/i386/kvm/xen_overlay.h
+++ b/hw/i386/kvm/xen_overlay.h
@@ -17,4 +17,8 @@ void xen_overlay_create(void);
 int xen_overlay_map_shinfo_page(uint64_t gpa);
 void *xen_overlay_get_shinfo_ptr(void);
 
+int xen_sync_long_mode(void);
+int xen_set_long_mode(bool long_mode);
+bool xen_is_long_mode(void);
+
 #endif /* QEMU_XEN_OVERLAY_H */
-- 
cgit v1.1


From 91cce756179195f0725f551a5b2922ae2abbef9f Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 16 Dec 2022 14:02:29 +0000
Subject: hw/xen: Add xen_evtchn device for event channel emulation

Include basic support for setting HVM_PARAM_CALLBACK_IRQ to the global
vector method HVM_PARAM_CALLBACK_TYPE_VECTOR, which is handled in-kernel
by raising the vector whenever the vCPU's vcpu_info->evtchn_upcall_pending
flag is set.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/meson.build  |   5 +-
 hw/i386/kvm/xen_evtchn.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |  18 ++++++
 hw/i386/pc.c             |   2 +
 4 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 hw/i386/kvm/xen_evtchn.c
 create mode 100644 hw/i386/kvm/xen_evtchn.h

(limited to 'hw')

diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 6165cbf..cab64df 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -4,6 +4,9 @@ i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c'))
 i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c'))
-i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen_overlay.c'))
+i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
+  'xen_overlay.c',
+  'xen_evtchn.c',
+  ))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
new file mode 100644
index 0000000..9d6f407
--- /dev/null
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -0,0 +1,155 @@
+/*
+ * QEMU Xen emulation: Event channel support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "exec/target_page.h"
+#include "exec/address-spaces.h"
+#include "migration/vmstate.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "xen_evtchn.h"
+
+#include "sysemu/kvm.h"
+#include "sysemu/kvm_xen.h"
+#include <linux/kvm.h>
+
+#include "hw/xen/interface/memory.h"
+#include "hw/xen/interface/hvm/params.h"
+
+#define TYPE_XEN_EVTCHN "xen-evtchn"
+OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN)
+
+struct XenEvtchnState {
+    /*< private >*/
+    SysBusDevice busdev;
+    /*< public >*/
+
+    uint64_t callback_param;
+    bool evtchn_in_kernel;
+
+    QemuMutex port_lock;
+};
+
+struct XenEvtchnState *xen_evtchn_singleton;
+
+/* Top bits of callback_param are the type (HVM_PARAM_CALLBACK_TYPE_xxx) */
+#define CALLBACK_VIA_TYPE_SHIFT 56
+
+static int xen_evtchn_post_load(void *opaque, int version_id)
+{
+    XenEvtchnState *s = opaque;
+
+    if (s->callback_param) {
+        xen_evtchn_set_callback_param(s->callback_param);
+    }
+
+    return 0;
+}
+
+static bool xen_evtchn_is_needed(void *opaque)
+{
+    return xen_mode == XEN_EMULATE;
+}
+
+static const VMStateDescription xen_evtchn_vmstate = {
+    .name = "xen_evtchn",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = xen_evtchn_is_needed,
+    .post_load = xen_evtchn_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(callback_param, XenEvtchnState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void xen_evtchn_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->vmsd = &xen_evtchn_vmstate;
+}
+
+static const TypeInfo xen_evtchn_info = {
+    .name          = TYPE_XEN_EVTCHN,
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(XenEvtchnState),
+    .class_init    = xen_evtchn_class_init,
+};
+
+void xen_evtchn_create(void)
+{
+    XenEvtchnState *s = XEN_EVTCHN(sysbus_create_simple(TYPE_XEN_EVTCHN,
+                                                        -1, NULL));
+    xen_evtchn_singleton = s;
+
+    qemu_mutex_init(&s->port_lock);
+}
+
+static void xen_evtchn_register_types(void)
+{
+    type_register_static(&xen_evtchn_info);
+}
+
+type_init(xen_evtchn_register_types)
+
+int xen_evtchn_set_callback_param(uint64_t param)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    struct kvm_xen_hvm_attr xa = {
+        .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
+        .u.vector = 0,
+    };
+    bool in_kernel = false;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    switch (param >> CALLBACK_VIA_TYPE_SHIFT) {
+    case HVM_PARAM_CALLBACK_TYPE_VECTOR: {
+        xa.u.vector = (uint8_t)param,
+
+        ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+        if (!ret && kvm_xen_has_cap(EVTCHN_SEND)) {
+            in_kernel = true;
+        }
+        break;
+    }
+    default:
+        /* Xen doesn't return error even if you set something bogus */
+        ret = 0;
+        break;
+    }
+
+    if (!ret) {
+        /* If vector delivery was turned *off* then tell the kernel */
+        if ((s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) ==
+            HVM_PARAM_CALLBACK_TYPE_VECTOR && !xa.u.vector) {
+            kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+        }
+        s->callback_param = param;
+        s->evtchn_in_kernel = in_kernel;
+    }
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
new file mode 100644
index 0000000..c9b7f9d
--- /dev/null
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -0,0 +1,18 @@
+/*
+ * QEMU Xen emulation: Event channel support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_XEN_EVTCHN_H
+#define QEMU_XEN_EVTCHN_H
+
+void xen_evtchn_create(void);
+int xen_evtchn_set_callback_param(uint64_t param);
+
+#endif /* QEMU_XEN_EVTCHN_H */
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index a316a01..f38e3f1 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -91,6 +91,7 @@
 #include "hw/virtio/virtio-pmem-pci.h"
 #include "hw/virtio/virtio-mem-pci.h"
 #include "hw/i386/kvm/xen_overlay.h"
+#include "hw/i386/kvm/xen_evtchn.h"
 #include "hw/mem/memory-device.h"
 #include "sysemu/replay.h"
 #include "target/i386/cpu.h"
@@ -1852,6 +1853,7 @@ int pc_machine_kvm_type(MachineState *machine, const char *kvm_type)
 #ifdef CONFIG_XEN_EMU
     if (xen_mode == XEN_EMULATE) {
         xen_overlay_create();
+        xen_evtchn_create();
     }
 #endif
     return 0;
-- 
cgit v1.1


From 4858ba2065c8bddf7f1c056eda3a5357588bdd50 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Tue, 13 Dec 2022 13:29:46 +0000
Subject: hw/xen: Implement EVTCHNOP_status

This adds the basic structure for maintaining the port table and reporting
the status of ports therein.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |   3 ++
 2 files changed, 107 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 9d6f407..8bed338 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -22,6 +22,7 @@
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
 #include "xen_evtchn.h"
+#include "xen_overlay.h"
 
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_xen.h"
@@ -33,6 +34,22 @@
 #define TYPE_XEN_EVTCHN "xen-evtchn"
 OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN)
 
+typedef struct XenEvtchnPort {
+    uint32_t vcpu;      /* Xen/ACPI vcpu_id */
+    uint16_t type;      /* EVTCHNSTAT_xxxx */
+    uint16_t type_val;  /* pirq# / virq# / remote port according to type */
+} XenEvtchnPort;
+
+#define COMPAT_EVTCHN_2L_NR_CHANNELS            1024
+
+/*
+ * For unbound/interdomain ports there are only two possible remote
+ * domains; self and QEMU. Use a single high bit in type_val for that,
+ * and the low bits for the remote port number (or 0 for unbound).
+ */
+#define PORT_INFO_TYPEVAL_REMOTE_QEMU           0x8000
+#define PORT_INFO_TYPEVAL_REMOTE_PORT_MASK      0x7FFF
+
 struct XenEvtchnState {
     /*< private >*/
     SysBusDevice busdev;
@@ -42,6 +59,8 @@ struct XenEvtchnState {
     bool evtchn_in_kernel;
 
     QemuMutex port_lock;
+    uint32_t nr_ports;
+    XenEvtchnPort port_table[EVTCHN_2L_NR_CHANNELS];
 };
 
 struct XenEvtchnState *xen_evtchn_singleton;
@@ -65,6 +84,18 @@ static bool xen_evtchn_is_needed(void *opaque)
     return xen_mode == XEN_EMULATE;
 }
 
+static const VMStateDescription xen_evtchn_port_vmstate = {
+    .name = "xen_evtchn_port",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(vcpu, XenEvtchnPort),
+        VMSTATE_UINT16(type, XenEvtchnPort),
+        VMSTATE_UINT16(type_val, XenEvtchnPort),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription xen_evtchn_vmstate = {
     .name = "xen_evtchn",
     .version_id = 1,
@@ -73,6 +104,9 @@ static const VMStateDescription xen_evtchn_vmstate = {
     .post_load = xen_evtchn_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(callback_param, XenEvtchnState),
+        VMSTATE_UINT32(nr_ports, XenEvtchnState),
+        VMSTATE_STRUCT_VARRAY_UINT32(port_table, XenEvtchnState, nr_ports, 1,
+                                     xen_evtchn_port_vmstate, XenEvtchnPort),
         VMSTATE_END_OF_LIST()
     }
 };
@@ -153,3 +187,73 @@ int xen_evtchn_set_callback_param(uint64_t param)
 
     return ret;
 }
+
+static bool valid_port(evtchn_port_t port)
+{
+    if (!port) {
+        return false;
+    }
+
+    if (xen_is_long_mode()) {
+        return port < EVTCHN_2L_NR_CHANNELS;
+    } else {
+        return port < COMPAT_EVTCHN_2L_NR_CHANNELS;
+    }
+}
+
+int xen_evtchn_status_op(struct evtchn_status *status)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    XenEvtchnPort *p;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (status->dom != DOMID_SELF && status->dom != xen_domid) {
+        return -ESRCH;
+    }
+
+    if (!valid_port(status->port)) {
+        return -EINVAL;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    p = &s->port_table[status->port];
+
+    status->status = p->type;
+    status->vcpu = p->vcpu;
+
+    switch (p->type) {
+    case EVTCHNSTAT_unbound:
+        if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+            status->u.unbound.dom = DOMID_QEMU;
+        } else {
+            status->u.unbound.dom = xen_domid;
+        }
+        break;
+
+    case EVTCHNSTAT_interdomain:
+        if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+            status->u.interdomain.dom = DOMID_QEMU;
+        } else {
+            status->u.interdomain.dom = xen_domid;
+        }
+
+        status->u.interdomain.port = p->type_val &
+            PORT_INFO_TYPEVAL_REMOTE_PORT_MASK;
+        break;
+
+    case EVTCHNSTAT_pirq:
+        status->u.pirq = p->type_val;
+        break;
+
+    case EVTCHNSTAT_virq:
+        status->u.virq = p->type_val;
+        break;
+    }
+
+    qemu_mutex_unlock(&s->port_lock);
+    return 0;
+}
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index c9b7f9d..7646763 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -15,4 +15,7 @@
 void xen_evtchn_create(void);
 int xen_evtchn_set_callback_param(uint64_t param);
 
+struct evtchn_status;
+int xen_evtchn_status_op(struct evtchn_status *status);
+
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From 83eb5811342b4e48544ab79dd92a0bb1a2a2cbfb Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Tue, 13 Dec 2022 13:57:44 +0000
Subject: hw/xen: Implement EVTCHNOP_close

It calls an internal close_port() helper which will also be used from
EVTCHNOP_reset and will actually do the work to disconnect/unbind a port
once any of that is actually implemented in the first place.

That in turn calls a free_port() internal function which will be in
error paths after allocation.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |   2 +
 2 files changed, 125 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 8bed338..c57d36b 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -21,6 +21,7 @@
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+
 #include "xen_evtchn.h"
 #include "xen_overlay.h"
 
@@ -40,6 +41,41 @@ typedef struct XenEvtchnPort {
     uint16_t type_val;  /* pirq# / virq# / remote port according to type */
 } XenEvtchnPort;
 
+/* 32-bit compatibility definitions, also used natively in 32-bit build */
+struct compat_arch_vcpu_info {
+    unsigned int cr2;
+    unsigned int pad[5];
+};
+
+struct compat_vcpu_info {
+    uint8_t evtchn_upcall_pending;
+    uint8_t evtchn_upcall_mask;
+    uint16_t pad;
+    uint32_t evtchn_pending_sel;
+    struct compat_arch_vcpu_info arch;
+    struct vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
+struct compat_arch_shared_info {
+    unsigned int max_pfn;
+    unsigned int pfn_to_mfn_frame_list_list;
+    unsigned int nmi_reason;
+    unsigned int p2m_cr3;
+    unsigned int p2m_vaddr;
+    unsigned int p2m_generation;
+    uint32_t wc_sec_hi;
+};
+
+struct compat_shared_info {
+    struct compat_vcpu_info vcpu_info[XEN_LEGACY_MAX_VCPUS];
+    uint32_t evtchn_pending[32];
+    uint32_t evtchn_mask[32];
+    uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
+    uint32_t wc_sec;
+    uint32_t wc_nsec;
+    struct compat_arch_shared_info arch;
+};
+
 #define COMPAT_EVTCHN_2L_NR_CHANNELS            1024
 
 /*
@@ -257,3 +293,90 @@ int xen_evtchn_status_op(struct evtchn_status *status)
     qemu_mutex_unlock(&s->port_lock);
     return 0;
 }
+
+static int clear_port_pending(XenEvtchnState *s, evtchn_port_t port)
+{
+    void *p = xen_overlay_get_shinfo_ptr();
+
+    if (!p) {
+        return -ENOTSUP;
+    }
+
+    if (xen_is_long_mode()) {
+        struct shared_info *shinfo = p;
+        const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+        typeof(shinfo->evtchn_pending[0]) mask;
+        int idx = port / bits_per_word;
+        int offset = port % bits_per_word;
+
+        mask = 1UL << offset;
+
+        qatomic_fetch_and(&shinfo->evtchn_pending[idx], ~mask);
+    } else {
+        struct compat_shared_info *shinfo = p;
+        const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+        typeof(shinfo->evtchn_pending[0]) mask;
+        int idx = port / bits_per_word;
+        int offset = port % bits_per_word;
+
+        mask = 1UL << offset;
+
+        qatomic_fetch_and(&shinfo->evtchn_pending[idx], ~mask);
+    }
+    return 0;
+}
+
+static void free_port(XenEvtchnState *s, evtchn_port_t port)
+{
+    s->port_table[port].type = EVTCHNSTAT_closed;
+    s->port_table[port].type_val = 0;
+    s->port_table[port].vcpu = 0;
+
+    if (s->nr_ports == port + 1) {
+        do {
+            s->nr_ports--;
+        } while (s->nr_ports &&
+                 s->port_table[s->nr_ports - 1].type == EVTCHNSTAT_closed);
+    }
+
+    /* Clear pending event to avoid unexpected behavior on re-bind. */
+    clear_port_pending(s, port);
+}
+
+static int close_port(XenEvtchnState *s, evtchn_port_t port)
+{
+    XenEvtchnPort *p = &s->port_table[port];
+
+    switch (p->type) {
+    case EVTCHNSTAT_closed:
+        return -ENOENT;
+
+    default:
+        break;
+    }
+
+    free_port(s, port);
+    return 0;
+}
+
+int xen_evtchn_close_op(struct evtchn_close *close)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (!valid_port(close->port)) {
+        return -EINVAL;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    ret = close_port(s, close->port);
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 7646763..cb39249 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -16,6 +16,8 @@ void xen_evtchn_create(void);
 int xen_evtchn_set_callback_param(uint64_t param);
 
 struct evtchn_status;
+struct evtchn_close;
 int xen_evtchn_status_op(struct evtchn_status *status);
+int xen_evtchn_close_op(struct evtchn_close *close);
 
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From 190cc3c0edd60876150959ee9f242363e40d168e Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Tue, 13 Dec 2022 17:20:46 +0000
Subject: hw/xen: Implement EVTCHNOP_unmask

This finally comes with a mechanism for actually injecting events into
the guest vCPU, with all the atomic-test-and-set that's involved in
setting the bit in the shinfo, then the index in the vcpu_info, and
injecting either the lapic vector as MSI, or letting KVM inject the
bare vector.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |   2 +
 2 files changed, 177 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index c57d36b..54707f1 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -224,6 +224,13 @@ int xen_evtchn_set_callback_param(uint64_t param)
     return ret;
 }
 
+static void inject_callback(XenEvtchnState *s, uint32_t vcpu)
+{
+    int type = s->callback_param >> CALLBACK_VIA_TYPE_SHIFT;
+
+    kvm_xen_inject_vcpu_callback_vector(vcpu, type);
+}
+
 static bool valid_port(evtchn_port_t port)
 {
     if (!port) {
@@ -294,6 +301,152 @@ int xen_evtchn_status_op(struct evtchn_status *status)
     return 0;
 }
 
+/*
+ * Never thought I'd hear myself say this, but C++ templates would be
+ * kind of nice here.
+ *
+ * template<class T> static int do_unmask_port(T *shinfo, ...);
+ */
+static int do_unmask_port_lm(XenEvtchnState *s, evtchn_port_t port,
+                             bool do_unmask, struct shared_info *shinfo,
+                             struct vcpu_info *vcpu_info)
+{
+    const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+    typeof(shinfo->evtchn_pending[0]) mask;
+    int idx = port / bits_per_word;
+    int offset = port % bits_per_word;
+
+    mask = 1UL << offset;
+
+    if (idx >= bits_per_word) {
+        return -EINVAL;
+    }
+
+    if (do_unmask) {
+        /*
+         * If this is a true unmask operation, clear the mask bit. If
+         * it was already unmasked, we have nothing further to do.
+         */
+        if (!((qatomic_fetch_and(&shinfo->evtchn_mask[idx], ~mask) & mask))) {
+            return 0;
+        }
+    } else {
+        /*
+         * This is a pseudo-unmask for affinity changes. We don't
+         * change the mask bit, and if it's *masked* we have nothing
+         * else to do.
+         */
+        if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) {
+            return 0;
+        }
+    }
+
+    /* If the event was not pending, we're done. */
+    if (!(qatomic_fetch_or(&shinfo->evtchn_pending[idx], 0) & mask)) {
+        return 0;
+    }
+
+    /* Now on to the vcpu_info evtchn_pending_sel index... */
+    mask = 1UL << idx;
+
+    /* If a port in this word was already pending for this vCPU, all done. */
+    if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) {
+        return 0;
+    }
+
+    /* Set evtchn_upcall_pending for this vCPU */
+    if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) {
+        return 0;
+    }
+
+    inject_callback(s, s->port_table[port].vcpu);
+
+    return 0;
+}
+
+static int do_unmask_port_compat(XenEvtchnState *s, evtchn_port_t port,
+                                 bool do_unmask,
+                                 struct compat_shared_info *shinfo,
+                                 struct compat_vcpu_info *vcpu_info)
+{
+    const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+    typeof(shinfo->evtchn_pending[0]) mask;
+    int idx = port / bits_per_word;
+    int offset = port % bits_per_word;
+
+    mask = 1UL << offset;
+
+    if (idx >= bits_per_word) {
+        return -EINVAL;
+    }
+
+    if (do_unmask) {
+        /*
+         * If this is a true unmask operation, clear the mask bit. If
+         * it was already unmasked, we have nothing further to do.
+         */
+        if (!((qatomic_fetch_and(&shinfo->evtchn_mask[idx], ~mask) & mask))) {
+            return 0;
+        }
+    } else {
+        /*
+         * This is a pseudo-unmask for affinity changes. We don't
+         * change the mask bit, and if it's *masked* we have nothing
+         * else to do.
+         */
+        if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) {
+            return 0;
+        }
+    }
+
+    /* If the event was not pending, we're done. */
+    if (!(qatomic_fetch_or(&shinfo->evtchn_pending[idx], 0) & mask)) {
+        return 0;
+    }
+
+    /* Now on to the vcpu_info evtchn_pending_sel index... */
+    mask = 1UL << idx;
+
+    /* If a port in this word was already pending for this vCPU, all done. */
+    if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) {
+        return 0;
+    }
+
+    /* Set evtchn_upcall_pending for this vCPU */
+    if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) {
+        return 0;
+    }
+
+    inject_callback(s, s->port_table[port].vcpu);
+
+    return 0;
+}
+
+static int unmask_port(XenEvtchnState *s, evtchn_port_t port, bool do_unmask)
+{
+    void *vcpu_info, *shinfo;
+
+    if (s->port_table[port].type == EVTCHNSTAT_closed) {
+        return -EINVAL;
+    }
+
+    shinfo = xen_overlay_get_shinfo_ptr();
+    if (!shinfo) {
+        return -ENOTSUP;
+    }
+
+    vcpu_info = kvm_xen_get_vcpu_info_hva(s->port_table[port].vcpu);
+    if (!vcpu_info) {
+        return -EINVAL;
+    }
+
+    if (xen_is_long_mode()) {
+        return do_unmask_port_lm(s, port, do_unmask, shinfo, vcpu_info);
+    } else {
+        return do_unmask_port_compat(s, port, do_unmask, shinfo, vcpu_info);
+    }
+}
+
 static int clear_port_pending(XenEvtchnState *s, evtchn_port_t port)
 {
     void *p = xen_overlay_get_shinfo_ptr();
@@ -380,3 +533,25 @@ int xen_evtchn_close_op(struct evtchn_close *close)
 
     return ret;
 }
+
+int xen_evtchn_unmask_op(struct evtchn_unmask *unmask)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (!valid_port(unmask->port)) {
+        return -EINVAL;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    ret = unmask_port(s, unmask->port, true);
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index cb39249..69c6b0d 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -17,7 +17,9 @@ int xen_evtchn_set_callback_param(uint64_t param);
 
 struct evtchn_status;
 struct evtchn_close;
+struct evtchn_unmask;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
+int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
 
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From c723d4c15e224e9d058384eecd30559580836c7a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Tue, 13 Dec 2022 22:40:56 +0000
Subject: hw/xen: Implement EVTCHNOP_bind_virq

Add the array of virq ports to each vCPU so that we can deliver timers,
debug ports, etc. Global virqs are allocated against vCPU 0 initially,
but can be migrated to other vCPUs (when we implement that).

The kernel needs to know about VIRQ_TIMER in order to accelerate timers,
so tell it via KVM_XEN_VCPU_ATTR_TYPE_TIMER. Also save/restore the value
of the singleshot timer across migration, as the kernel will handle the
hypercalls automatically now.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |  2 ++
 2 files changed, 87 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 54707f1..a3202d3 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -244,6 +244,11 @@ static bool valid_port(evtchn_port_t port)
     }
 }
 
+static bool valid_vcpu(uint32_t vcpu)
+{
+    return !!qemu_get_cpu(vcpu);
+}
+
 int xen_evtchn_status_op(struct evtchn_status *status)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
@@ -496,6 +501,43 @@ static void free_port(XenEvtchnState *s, evtchn_port_t port)
     clear_port_pending(s, port);
 }
 
+static int allocate_port(XenEvtchnState *s, uint32_t vcpu, uint16_t type,
+                         uint16_t val, evtchn_port_t *port)
+{
+    evtchn_port_t p = 1;
+
+    for (p = 1; valid_port(p); p++) {
+        if (s->port_table[p].type == EVTCHNSTAT_closed) {
+            s->port_table[p].vcpu = vcpu;
+            s->port_table[p].type = type;
+            s->port_table[p].type_val = val;
+
+            *port = p;
+
+            if (s->nr_ports < p + 1) {
+                s->nr_ports = p + 1;
+            }
+
+            return 0;
+        }
+    }
+    return -ENOSPC;
+}
+
+static bool virq_is_global(uint32_t virq)
+{
+    switch (virq) {
+    case VIRQ_TIMER:
+    case VIRQ_DEBUG:
+    case VIRQ_XENOPROF:
+    case VIRQ_XENPMU:
+        return false;
+
+    default:
+        return true;
+    }
+}
+
 static int close_port(XenEvtchnState *s, evtchn_port_t port)
 {
     XenEvtchnPort *p = &s->port_table[port];
@@ -504,6 +546,11 @@ static int close_port(XenEvtchnState *s, evtchn_port_t port)
     case EVTCHNSTAT_closed:
         return -ENOENT;
 
+    case EVTCHNSTAT_virq:
+        kvm_xen_set_vcpu_virq(virq_is_global(p->type_val) ? 0 : p->vcpu,
+                              p->type_val, 0);
+        break;
+
     default:
         break;
     }
@@ -555,3 +602,41 @@ int xen_evtchn_unmask_op(struct evtchn_unmask *unmask)
 
     return ret;
 }
+
+int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (virq->virq >= NR_VIRQS) {
+        return -EINVAL;
+    }
+
+    /* Global VIRQ must be allocated on vCPU0 first */
+    if (virq_is_global(virq->virq) && virq->vcpu != 0) {
+        return -EINVAL;
+    }
+
+    if (!valid_vcpu(virq->vcpu)) {
+        return -ENOENT;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    ret = allocate_port(s, virq->vcpu, EVTCHNSTAT_virq, virq->virq,
+                        &virq->port);
+    if (!ret) {
+        ret = kvm_xen_set_vcpu_virq(virq->vcpu, virq->virq, virq->port);
+        if (ret) {
+            free_port(s, virq->port);
+        }
+    }
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 69c6b0d..0ea13dd 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -18,8 +18,10 @@ int xen_evtchn_set_callback_param(uint64_t param);
 struct evtchn_status;
 struct evtchn_close;
 struct evtchn_unmask;
+struct evtchn_bind_virq;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
+int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
 
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From f5417856d25f1cf4932ac77a687979af8427f313 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Tue, 13 Dec 2022 23:12:59 +0000
Subject: hw/xen: Implement EVTCHNOP_bind_ipi

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |  2 ++
 2 files changed, 71 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index a3202d3..eea80dc 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -13,6 +13,7 @@
 #include "qemu/host-utils.h"
 #include "qemu/module.h"
 #include "qemu/main-loop.h"
+#include "qemu/log.h"
 #include "qapi/error.h"
 #include "qom/object.h"
 #include "exec/target_page.h"
@@ -231,6 +232,43 @@ static void inject_callback(XenEvtchnState *s, uint32_t vcpu)
     kvm_xen_inject_vcpu_callback_vector(vcpu, type);
 }
 
+static void deassign_kernel_port(evtchn_port_t port)
+{
+    struct kvm_xen_hvm_attr ha;
+    int ret;
+
+    ha.type = KVM_XEN_ATTR_TYPE_EVTCHN;
+    ha.u.evtchn.send_port = port;
+    ha.u.evtchn.flags = KVM_XEN_EVTCHN_DEASSIGN;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &ha);
+    if (ret) {
+        qemu_log_mask(LOG_GUEST_ERROR, "Failed to unbind kernel port %d: %s\n",
+                      port, strerror(ret));
+    }
+}
+
+static int assign_kernel_port(uint16_t type, evtchn_port_t port,
+                              uint32_t vcpu_id)
+{
+    CPUState *cpu = qemu_get_cpu(vcpu_id);
+    struct kvm_xen_hvm_attr ha;
+
+    if (!cpu) {
+        return -ENOENT;
+    }
+
+    ha.type = KVM_XEN_ATTR_TYPE_EVTCHN;
+    ha.u.evtchn.send_port = port;
+    ha.u.evtchn.type = type;
+    ha.u.evtchn.flags = 0;
+    ha.u.evtchn.deliver.port.port = port;
+    ha.u.evtchn.deliver.port.vcpu = kvm_arch_vcpu_id(cpu);
+    ha.u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+    return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &ha);
+}
+
 static bool valid_port(evtchn_port_t port)
 {
     if (!port) {
@@ -551,6 +589,12 @@ static int close_port(XenEvtchnState *s, evtchn_port_t port)
                               p->type_val, 0);
         break;
 
+    case EVTCHNSTAT_ipi:
+        if (s->evtchn_in_kernel) {
+            deassign_kernel_port(port);
+        }
+        break;
+
     default:
         break;
     }
@@ -640,3 +684,28 @@ int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq)
 
     return ret;
 }
+
+int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (!valid_vcpu(ipi->vcpu)) {
+        return -ENOENT;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    ret = allocate_port(s, ipi->vcpu, EVTCHNSTAT_ipi, 0, &ipi->port);
+    if (!ret && s->evtchn_in_kernel) {
+        assign_kernel_port(EVTCHNSTAT_ipi, ipi->port, ipi->vcpu);
+    }
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 0ea13dd..107f420 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -19,9 +19,11 @@ struct evtchn_status;
 struct evtchn_close;
 struct evtchn_unmask;
 struct evtchn_bind_virq;
+struct evtchn_bind_ipi;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
 int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
+int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
 
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From cf7679abddd05712c5cfbe5516328586eded22c3 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Wed, 14 Dec 2022 00:11:07 +0000
Subject: hw/xen: Implement EVTCHNOP_send

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |   2 +
 2 files changed, 182 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index eea80dc..ce7a969 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -490,6 +490,133 @@ static int unmask_port(XenEvtchnState *s, evtchn_port_t port, bool do_unmask)
     }
 }
 
+static int do_set_port_lm(XenEvtchnState *s, evtchn_port_t port,
+                          struct shared_info *shinfo,
+                          struct vcpu_info *vcpu_info)
+{
+    const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+    typeof(shinfo->evtchn_pending[0]) mask;
+    int idx = port / bits_per_word;
+    int offset = port % bits_per_word;
+
+    mask = 1UL << offset;
+
+    if (idx >= bits_per_word) {
+        return -EINVAL;
+    }
+
+    /* Update the pending bit itself. If it was already set, we're done. */
+    if (qatomic_fetch_or(&shinfo->evtchn_pending[idx], mask) & mask) {
+        return 0;
+    }
+
+    /* Check if it's masked. */
+    if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) {
+        return 0;
+    }
+
+    /* Now on to the vcpu_info evtchn_pending_sel index... */
+    mask = 1UL << idx;
+
+    /* If a port in this word was already pending for this vCPU, all done. */
+    if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) {
+        return 0;
+    }
+
+    /* Set evtchn_upcall_pending for this vCPU */
+    if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) {
+        return 0;
+    }
+
+    inject_callback(s, s->port_table[port].vcpu);
+
+    return 0;
+}
+
+static int do_set_port_compat(XenEvtchnState *s, evtchn_port_t port,
+                              struct compat_shared_info *shinfo,
+                              struct compat_vcpu_info *vcpu_info)
+{
+    const int bits_per_word = BITS_PER_BYTE * sizeof(shinfo->evtchn_pending[0]);
+    typeof(shinfo->evtchn_pending[0]) mask;
+    int idx = port / bits_per_word;
+    int offset = port % bits_per_word;
+
+    mask = 1UL << offset;
+
+    if (idx >= bits_per_word) {
+        return -EINVAL;
+    }
+
+    /* Update the pending bit itself. If it was already set, we're done. */
+    if (qatomic_fetch_or(&shinfo->evtchn_pending[idx], mask) & mask) {
+        return 0;
+    }
+
+    /* Check if it's masked. */
+    if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) {
+        return 0;
+    }
+
+    /* Now on to the vcpu_info evtchn_pending_sel index... */
+    mask = 1UL << idx;
+
+    /* If a port in this word was already pending for this vCPU, all done. */
+    if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) {
+        return 0;
+    }
+
+    /* Set evtchn_upcall_pending for this vCPU */
+    if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) {
+        return 0;
+    }
+
+    inject_callback(s, s->port_table[port].vcpu);
+
+    return 0;
+}
+
+static int set_port_pending(XenEvtchnState *s, evtchn_port_t port)
+{
+    void *vcpu_info, *shinfo;
+
+    if (s->port_table[port].type == EVTCHNSTAT_closed) {
+        return -EINVAL;
+    }
+
+    if (s->evtchn_in_kernel) {
+        XenEvtchnPort *p = &s->port_table[port];
+        CPUState *cpu = qemu_get_cpu(p->vcpu);
+        struct kvm_irq_routing_xen_evtchn evt;
+
+        if (!cpu) {
+            return 0;
+        }
+
+        evt.port = port;
+        evt.vcpu = kvm_arch_vcpu_id(cpu);
+        evt.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+        return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_EVTCHN_SEND, &evt);
+    }
+
+    shinfo = xen_overlay_get_shinfo_ptr();
+    if (!shinfo) {
+        return -ENOTSUP;
+    }
+
+    vcpu_info = kvm_xen_get_vcpu_info_hva(s->port_table[port].vcpu);
+    if (!vcpu_info) {
+        return -EINVAL;
+    }
+
+    if (xen_is_long_mode()) {
+        return do_set_port_lm(s, port, shinfo, vcpu_info);
+    } else {
+        return do_set_port_compat(s, port, shinfo, vcpu_info);
+    }
+}
+
 static int clear_port_pending(XenEvtchnState *s, evtchn_port_t port)
 {
     void *p = xen_overlay_get_shinfo_ptr();
@@ -709,3 +836,56 @@ int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi)
 
     return ret;
 }
+
+int xen_evtchn_send_op(struct evtchn_send *send)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    XenEvtchnPort *p;
+    int ret = 0;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (!valid_port(send->port)) {
+        return -EINVAL;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    p = &s->port_table[send->port];
+
+    switch (p->type) {
+    case EVTCHNSTAT_interdomain:
+        if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+            /*
+             * This is an event from the guest to qemu itself, which is
+             * serving as the driver domain. Not yet implemented; it will
+             * be hooked up to the qemu implementation of xenstore,
+             * console, PV net/block drivers etc.
+             */
+            ret = -ENOSYS;
+        } else {
+            /* Loopback interdomain ports; just a complex IPI */
+            set_port_pending(s, p->type_val);
+        }
+        break;
+
+    case EVTCHNSTAT_ipi:
+        set_port_pending(s, send->port);
+        break;
+
+    case EVTCHNSTAT_unbound:
+        /* Xen will silently drop these */
+        break;
+
+    default:
+        ret = -EINVAL;
+        break;
+    }
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
+
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 107f420..500fdbe 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -20,10 +20,12 @@ struct evtchn_close;
 struct evtchn_unmask;
 struct evtchn_bind_virq;
 struct evtchn_bind_ipi;
+struct evtchn_send;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
 int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
 int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
+int xen_evtchn_send_op(struct evtchn_send *send);
 
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From e1db61b87b0e479bcacdf0c17656c8ee563cc5ba Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Wed, 14 Dec 2022 16:39:48 +0000
Subject: hw/xen: Implement EVTCHNOP_alloc_unbound

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 32 ++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |  2 ++
 2 files changed, 34 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index ce7a969..df25bf0 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -837,6 +837,38 @@ int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi)
     return ret;
 }
 
+int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    uint16_t type_val;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (alloc->dom != DOMID_SELF && alloc->dom != xen_domid) {
+        return -ESRCH;
+    }
+
+    if (alloc->remote_dom == DOMID_QEMU) {
+        type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+    } else if (alloc->remote_dom == DOMID_SELF ||
+               alloc->remote_dom == xen_domid) {
+        type_val = 0;
+    } else {
+        return -EPERM;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    ret = allocate_port(s, 0, EVTCHNSTAT_unbound, type_val, &alloc->port);
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
+
 int xen_evtchn_send_op(struct evtchn_send *send)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 500fdbe..fc08013 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -21,11 +21,13 @@ struct evtchn_unmask;
 struct evtchn_bind_virq;
 struct evtchn_bind_ipi;
 struct evtchn_send;
+struct evtchn_alloc_unbound;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
 int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
 int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
 int xen_evtchn_send_op(struct evtchn_send *send);
+int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
 
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From 8432788104b6b4843738edbc5cf084b2a3687ee9 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Wed, 14 Dec 2022 17:26:32 +0000
Subject: hw/xen: Implement EVTCHNOP_bind_interdomain

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |  2 ++
 2 files changed, 80 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index df25bf0..e2e36d9 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -722,6 +722,23 @@ static int close_port(XenEvtchnState *s, evtchn_port_t port)
         }
         break;
 
+    case EVTCHNSTAT_interdomain:
+        if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+            /* Not yet implemented. This can't happen! */
+        } else {
+            /* Loopback interdomain */
+            XenEvtchnPort *rp = &s->port_table[p->type_val];
+            if (!valid_port(p->type_val) || rp->type_val != port ||
+                rp->type != EVTCHNSTAT_interdomain) {
+                error_report("Inconsistent state for interdomain unbind");
+            } else {
+                /* Set the other end back to unbound */
+                rp->type = EVTCHNSTAT_unbound;
+                rp->type_val = 0;
+            }
+        }
+        break;
+
     default:
         break;
     }
@@ -837,6 +854,67 @@ int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi)
     return ret;
 }
 
+int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    uint16_t type_val;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (interdomain->remote_dom == DOMID_QEMU) {
+        type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+    } else if (interdomain->remote_dom == DOMID_SELF ||
+               interdomain->remote_dom == xen_domid) {
+        type_val = 0;
+    } else {
+        return -ESRCH;
+    }
+
+    if (!valid_port(interdomain->remote_port)) {
+        return -EINVAL;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    /* The newly allocated port starts out as unbound */
+    ret = allocate_port(s, 0, EVTCHNSTAT_unbound, type_val,
+                        &interdomain->local_port);
+    if (ret) {
+        goto out;
+    }
+
+    if (interdomain->remote_dom == DOMID_QEMU) {
+        /* We haven't hooked up QEMU's PV drivers to this yet */
+        ret = -ENOSYS;
+    } else {
+        /* Loopback */
+        XenEvtchnPort *rp = &s->port_table[interdomain->remote_port];
+        XenEvtchnPort *lp = &s->port_table[interdomain->local_port];
+
+        if (rp->type == EVTCHNSTAT_unbound && rp->type_val == 0) {
+            /* It's a match! */
+            rp->type = EVTCHNSTAT_interdomain;
+            rp->type_val = interdomain->local_port;
+
+            lp->type = EVTCHNSTAT_interdomain;
+            lp->type_val = interdomain->remote_port;
+        } else {
+            ret = -EINVAL;
+        }
+    }
+
+    if (ret) {
+        free_port(s, interdomain->local_port);
+    }
+ out:
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+
+}
 int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index fc08013..1ebc758 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -22,6 +22,7 @@ struct evtchn_bind_virq;
 struct evtchn_bind_ipi;
 struct evtchn_send;
 struct evtchn_alloc_unbound;
+struct evtchn_bind_interdomain;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
@@ -29,5 +30,6 @@ int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
 int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
 int xen_evtchn_send_op(struct evtchn_send *send);
 int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
+int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain);
 
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From 306670461bb19e7af42b3d68d007d7690efd2334 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Wed, 14 Dec 2022 19:27:38 +0000
Subject: hw/xen: Implement EVTCHNOP_bind_vcpu

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 40 ++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |  2 ++
 2 files changed, 42 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index e2e36d9..cbaf4f5 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -791,6 +791,46 @@ int xen_evtchn_unmask_op(struct evtchn_unmask *unmask)
     return ret;
 }
 
+int xen_evtchn_bind_vcpu_op(struct evtchn_bind_vcpu *vcpu)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    XenEvtchnPort *p;
+    int ret = -EINVAL;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (!valid_port(vcpu->port)) {
+        return -EINVAL;
+    }
+
+    if (!valid_vcpu(vcpu->vcpu)) {
+        return -ENOENT;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    p = &s->port_table[vcpu->port];
+
+    if (p->type == EVTCHNSTAT_interdomain ||
+        p->type == EVTCHNSTAT_unbound ||
+        p->type == EVTCHNSTAT_pirq ||
+        (p->type == EVTCHNSTAT_virq && virq_is_global(p->type_val))) {
+        /*
+         * unmask_port() with do_unmask==false will just raise the event
+         * on the new vCPU if the port was already pending.
+         */
+        p->vcpu = vcpu->vcpu;
+        unmask_port(s, vcpu->port, false);
+        ret = 0;
+    }
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
+
 int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 1ebc758..486b031 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -23,6 +23,7 @@ struct evtchn_bind_ipi;
 struct evtchn_send;
 struct evtchn_alloc_unbound;
 struct evtchn_bind_interdomain;
+struct evtchn_bind_vcpu;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
@@ -31,5 +32,6 @@ int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
 int xen_evtchn_send_op(struct evtchn_send *send);
 int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
 int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain);
+int xen_evtchn_bind_vcpu_op(struct evtchn_bind_vcpu *vcpu);
 
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From a15b10978fe62411936e577288f28184a41754a9 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Wed, 14 Dec 2022 19:36:15 +0000
Subject: hw/xen: Implement EVTCHNOP_reset

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 30 ++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |  3 +++
 2 files changed, 33 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index cbaf4f5..c804a31 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -12,6 +12,7 @@
 #include "qemu/osdep.h"
 #include "qemu/host-utils.h"
 #include "qemu/module.h"
+#include "qemu/lockable.h"
 #include "qemu/main-loop.h"
 #include "qemu/log.h"
 #include "qapi/error.h"
@@ -747,6 +748,35 @@ static int close_port(XenEvtchnState *s, evtchn_port_t port)
     return 0;
 }
 
+int xen_evtchn_soft_reset(void)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int i;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    assert(qemu_mutex_iothread_locked());
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    for (i = 0; i < s->nr_ports; i++) {
+        close_port(s, i);
+    }
+
+    return 0;
+}
+
+int xen_evtchn_reset_op(struct evtchn_reset *reset)
+{
+    if (reset->dom != DOMID_SELF && reset->dom != xen_domid) {
+        return -ESRCH;
+    }
+
+    return xen_evtchn_soft_reset();
+}
+
 int xen_evtchn_close_op(struct evtchn_close *close)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 486b031..5d3e035 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -13,6 +13,7 @@
 #define QEMU_XEN_EVTCHN_H
 
 void xen_evtchn_create(void);
+int xen_evtchn_soft_reset(void);
 int xen_evtchn_set_callback_param(uint64_t param);
 
 struct evtchn_status;
@@ -24,6 +25,7 @@ struct evtchn_send;
 struct evtchn_alloc_unbound;
 struct evtchn_bind_interdomain;
 struct evtchn_bind_vcpu;
+struct evtchn_reset;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
@@ -33,5 +35,6 @@ int xen_evtchn_send_op(struct evtchn_send *send);
 int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
 int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain);
 int xen_evtchn_bind_vcpu_op(struct evtchn_bind_vcpu *vcpu);
+int xen_evtchn_reset_op(struct evtchn_reset *reset);
 
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From 507cb64d6e66d672bfddb275fe746241e0ed8db2 Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Tue, 21 Aug 2018 12:16:19 -0400
Subject: i386/xen: add monitor commands to test event injection

Specifically add listing, injection of event channels.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/meson.build  |   7 +++
 hw/i386/kvm/xen-stubs.c  |  28 ++++++++++
 hw/i386/kvm/xen_evtchn.c | 137 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 172 insertions(+)
 create mode 100644 hw/i386/kvm/xen-stubs.c

(limited to 'hw')

diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index cab64df..325c901 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -10,3 +10,10 @@ i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
   ))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
+
+xen_stubs_ss = ss.source_set()
+xen_stubs_ss.add(when: 'CONFIG_XEN_EMU', if_false: files(
+  'xen-stubs.c',
+))
+
+specific_ss.add_all(when: 'CONFIG_SOFTMMU', if_true: xen_stubs_ss)
diff --git a/hw/i386/kvm/xen-stubs.c b/hw/i386/kvm/xen-stubs.c
new file mode 100644
index 0000000..720590a
--- /dev/null
+++ b/hw/i386/kvm/xen-stubs.c
@@ -0,0 +1,28 @@
+/*
+ * QEMU Xen emulation: QMP stubs
+ *
+ * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qapi/qapi-commands-misc-target.h"
+
+#ifdef TARGET_I386
+EvtchnInfoList *qmp_xen_event_list(Error **errp)
+{
+    error_setg(errp, "Xen event channel emulation not enabled");
+    return NULL;
+}
+
+void qmp_xen_event_inject(uint32_t port, Error **errp)
+{
+    error_setg(errp, "Xen event channel emulation not enabled");
+}
+#endif
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index c804a31..e937de7 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -15,7 +15,11 @@
 #include "qemu/lockable.h"
 #include "qemu/main-loop.h"
 #include "qemu/log.h"
+#include "monitor/monitor.h"
+#include "monitor/hmp.h"
 #include "qapi/error.h"
+#include "qapi/qapi-commands-misc-target.h"
+#include "qapi/qmp/qdict.h"
 #include "qom/object.h"
 #include "exec/target_page.h"
 #include "exec/address-spaces.h"
@@ -1069,3 +1073,136 @@ int xen_evtchn_send_op(struct evtchn_send *send)
     return ret;
 }
 
+EvtchnInfoList *qmp_xen_event_list(Error **errp)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    EvtchnInfoList *head = NULL, **tail = &head;
+    void *shinfo, *pending, *mask;
+    int i;
+
+    if (!s) {
+        error_setg(errp, "Xen event channel emulation not enabled");
+        return NULL;
+    }
+
+    shinfo = xen_overlay_get_shinfo_ptr();
+    if (!shinfo) {
+        error_setg(errp, "Xen shared info page not allocated");
+        return NULL;
+    }
+
+    if (xen_is_long_mode()) {
+        pending = shinfo + offsetof(struct shared_info, evtchn_pending);
+        mask = shinfo + offsetof(struct shared_info, evtchn_mask);
+    } else {
+        pending = shinfo + offsetof(struct compat_shared_info, evtchn_pending);
+        mask = shinfo + offsetof(struct compat_shared_info, evtchn_mask);
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    for (i = 0; i < s->nr_ports; i++) {
+        XenEvtchnPort *p = &s->port_table[i];
+        EvtchnInfo *info;
+
+        if (p->type == EVTCHNSTAT_closed) {
+            continue;
+        }
+
+        info = g_new0(EvtchnInfo, 1);
+
+        info->port = i;
+        qemu_build_assert(EVTCHN_PORT_TYPE_CLOSED == EVTCHNSTAT_closed);
+        qemu_build_assert(EVTCHN_PORT_TYPE_UNBOUND == EVTCHNSTAT_unbound);
+        qemu_build_assert(EVTCHN_PORT_TYPE_INTERDOMAIN == EVTCHNSTAT_interdomain);
+        qemu_build_assert(EVTCHN_PORT_TYPE_PIRQ == EVTCHNSTAT_pirq);
+        qemu_build_assert(EVTCHN_PORT_TYPE_VIRQ == EVTCHNSTAT_virq);
+        qemu_build_assert(EVTCHN_PORT_TYPE_IPI == EVTCHNSTAT_ipi);
+
+        info->type = p->type;
+        if (p->type == EVTCHNSTAT_interdomain) {
+            info->remote_domain = g_strdup((p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) ?
+                                           "qemu" : "loopback");
+            info->target = p->type_val & PORT_INFO_TYPEVAL_REMOTE_PORT_MASK;
+        } else {
+            info->target = p->type_val;
+        }
+        info->vcpu = p->vcpu;
+        info->pending = test_bit(i, pending);
+        info->masked = test_bit(i, mask);
+
+        QAPI_LIST_APPEND(tail, info);
+    }
+
+    return head;
+}
+
+void qmp_xen_event_inject(uint32_t port, Error **errp)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+
+    if (!s) {
+        error_setg(errp, "Xen event channel emulation not enabled");
+        return;
+    }
+
+    if (!valid_port(port)) {
+        error_setg(errp, "Invalid port %u", port);
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    if (set_port_pending(s, port)) {
+        error_setg(errp, "Failed to set port %u", port);
+        return;
+    }
+}
+
+void hmp_xen_event_list(Monitor *mon, const QDict *qdict)
+{
+    EvtchnInfoList *iter, *info_list;
+    Error *err = NULL;
+
+    info_list = qmp_xen_event_list(&err);
+    if (err) {
+        hmp_handle_error(mon, err);
+        return;
+    }
+
+    for (iter = info_list; iter; iter = iter->next) {
+        EvtchnInfo *info = iter->value;
+
+        monitor_printf(mon, "port %4u: vcpu: %d %s", info->port, info->vcpu,
+                       EvtchnPortType_str(info->type));
+        if (info->type != EVTCHN_PORT_TYPE_IPI) {
+            monitor_printf(mon,  "(");
+            if (info->remote_domain) {
+                monitor_printf(mon, "%s:", info->remote_domain);
+            }
+            monitor_printf(mon, "%d)", info->target);
+        }
+        if (info->pending) {
+            monitor_printf(mon, " PENDING");
+        }
+        if (info->masked) {
+            monitor_printf(mon, " MASKED");
+        }
+        monitor_printf(mon, "\n");
+    }
+
+    qapi_free_EvtchnInfoList(info_list);
+}
+
+void hmp_xen_event_inject(Monitor *mon, const QDict *qdict)
+{
+    int port = qdict_get_int(qdict, "port");
+    Error *err = NULL;
+
+    qmp_xen_event_inject(port, &err);
+    if (err) {
+        hmp_handle_error(mon, err);
+    } else {
+        monitor_printf(mon, "Delivered port %d\n", port);
+    }
+}
+
-- 
cgit v1.1


From ddf0fd9ae1fd1ff95489763b37a483adb3cd5907 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 15 Dec 2022 20:35:24 +0000
Subject: hw/xen: Support HVM_PARAM_CALLBACK_TYPE_GSI callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The GSI callback (and later PCI_INTX) is a level triggered interrupt. It
is asserted when an event channel is delivered to vCPU0, and is supposed
to be cleared when the vcpu_info->evtchn_upcall_pending field for vCPU0
is cleared again.

Thankfully, Xen does *not* assert the GSI if the guest sets its own
evtchn_upcall_pending field; we only need to assert the GSI when we
have delivered an event for ourselves. So that's the easy part, kind of.

There's a slight complexity in that we need to hold the BQL before we
can call qemu_set_irq(), and we definitely can't do that while holding
our own port_lock (because we'll need to take that from the qemu-side
functions that the PV backend drivers will call). So if we end up
wanting to set the IRQ in a context where we *don't* already hold the
BQL, defer to a BH.

However, we *do* need to poll for the evtchn_upcall_pending flag being
cleared. In an ideal world we would poll that when the EOI happens on
the PIC/IOAPIC. That's how it works in the kernel with the VFIO eventfd
pairs — one is used to trigger the interrupt, and the other works in the
other direction to 'resample' on EOI, and trigger the first eventfd
again if the line is still active.

However, QEMU doesn't seem to do that. Even VFIO level interrupts seem
to be supported by temporarily unmapping the device's BARs from the
guest when an interrupt happens, then trapping *all* MMIO to the device
and sending the 'resample' event on *every* MMIO access until the IRQ
is cleared! Maybe in future we'll plumb the 'resample' concept through
QEMU's irq framework but for now we'll do what Xen itself does: just
check the flag on every vmexit if the upcall GSI is known to be
asserted.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |  4 ++
 hw/i386/pc.c             |  6 +++
 3 files changed, 107 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index e937de7..6b0bdba 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -27,6 +27,8 @@
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+#include "hw/i386/x86.h"
+#include "hw/irq.h"
 
 #include "xen_evtchn.h"
 #include "xen_overlay.h"
@@ -100,9 +102,12 @@ struct XenEvtchnState {
     uint64_t callback_param;
     bool evtchn_in_kernel;
 
+    QEMUBH *gsi_bh;
+
     QemuMutex port_lock;
     uint32_t nr_ports;
     XenEvtchnPort port_table[EVTCHN_2L_NR_CHANNELS];
+    qemu_irq gsis[IOAPIC_NUM_PINS];
 };
 
 struct XenEvtchnState *xen_evtchn_singleton;
@@ -167,13 +172,42 @@ static const TypeInfo xen_evtchn_info = {
     .class_init    = xen_evtchn_class_init,
 };
 
+static void gsi_assert_bh(void *opaque)
+{
+    struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
+    if (vi) {
+        xen_evtchn_set_callback_level(!!vi->evtchn_upcall_pending);
+    }
+}
+
 void xen_evtchn_create(void)
 {
     XenEvtchnState *s = XEN_EVTCHN(sysbus_create_simple(TYPE_XEN_EVTCHN,
                                                         -1, NULL));
+    int i;
+
     xen_evtchn_singleton = s;
 
     qemu_mutex_init(&s->port_lock);
+    s->gsi_bh = aio_bh_new(qemu_get_aio_context(), gsi_assert_bh, s);
+
+    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+        sysbus_init_irq(SYS_BUS_DEVICE(s), &s->gsis[i]);
+    }
+}
+
+void xen_evtchn_connect_gsis(qemu_irq *system_gsis)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int i;
+
+    if (!s) {
+        return;
+    }
+
+    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+        sysbus_connect_irq(SYS_BUS_DEVICE(s), i, system_gsis[i]);
+    }
 }
 
 static void xen_evtchn_register_types(void)
@@ -183,6 +217,64 @@ static void xen_evtchn_register_types(void)
 
 type_init(xen_evtchn_register_types)
 
+void xen_evtchn_set_callback_level(int level)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    uint32_t param;
+
+    if (!s) {
+        return;
+    }
+
+    /*
+     * We get to this function in a number of ways:
+     *
+     *  • From I/O context, via PV backend drivers sending a notification to
+     *    the guest.
+     *
+     *  • From guest vCPU context, via loopback interdomain event channels
+     *    (or theoretically even IPIs but guests don't use those with GSI
+     *    delivery because that's pointless. We don't want a malicious guest
+     *    to be able to trigger a deadlock though, so we can't rule it out.)
+     *
+     *  • From guest vCPU context when the HVM_PARAM_CALLBACK_IRQ is being
+     *    configured.
+     *
+     *  • From guest vCPU context in the KVM exit handler, if the upcall
+     *    pending flag has been cleared and the GSI needs to be deasserted.
+     *
+     *  • Maybe in future, in an interrupt ack/eoi notifier when the GSI has
+     *    been acked in the irqchip.
+     *
+     * Whichever context we come from if we aren't already holding the BQL
+     * then e can't take it now, as we may already hold s->port_lock. So
+     * trigger the BH to set the IRQ for us instead of doing it immediately.
+     *
+     * In the HVM_PARAM_CALLBACK_IRQ and KVM exit handler cases, the caller
+     * will deliberately take the BQL because they want the change to take
+     * effect immediately. That just leaves interdomain loopback as the case
+     * which uses the BH.
+     */
+    if (!qemu_mutex_iothread_locked()) {
+        qemu_bh_schedule(s->gsi_bh);
+        return;
+    }
+
+    param = (uint32_t)s->callback_param;
+
+    switch (s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) {
+    case HVM_PARAM_CALLBACK_TYPE_GSI:
+        if (param < IOAPIC_NUM_PINS) {
+            qemu_set_irq(s->gsis[param], level);
+            if (level) {
+                /* Ensure the vCPU polls for deassertion */
+                kvm_xen_set_callback_asserted();
+            }
+        }
+        break;
+    }
+}
+
 int xen_evtchn_set_callback_param(uint64_t param)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
@@ -209,6 +301,11 @@ int xen_evtchn_set_callback_param(uint64_t param)
         }
         break;
     }
+
+    case HVM_PARAM_CALLBACK_TYPE_GSI:
+        ret = 0;
+        break;
+
     default:
         /* Xen doesn't return error even if you set something bogus */
         ret = 0;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 5d3e035..b03c310 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -12,9 +12,13 @@
 #ifndef QEMU_XEN_EVTCHN_H
 #define QEMU_XEN_EVTCHN_H
 
+#include "hw/sysbus.h"
+
 void xen_evtchn_create(void);
 int xen_evtchn_soft_reset(void);
 int xen_evtchn_set_callback_param(uint64_t param);
+void xen_evtchn_connect_gsis(qemu_irq *system_gsis);
+void xen_evtchn_set_callback_level(int level);
 
 struct evtchn_status;
 struct evtchn_close;
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f38e3f1..71d3b8b 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1310,6 +1310,12 @@ void pc_basic_device_init(struct PCMachineState *pcms,
     }
     *rtc_state = ISA_DEVICE(mc146818_rtc_init(isa_bus, 2000, rtc_irq));
 
+#ifdef CONFIG_XEN_EMU
+    if (xen_mode == XEN_EMULATE) {
+        xen_evtchn_connect_gsis(gsi);
+    }
+#endif
+
     qemu_register_boot_set(pc_boot_set, *rtc_state);
 
     if (!xen_enabled() &&
-- 
cgit v1.1


From 2aff696b10d16ef09dc5a2c953ceccbf6d38f744 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 16 Dec 2022 00:03:21 +0000
Subject: hw/xen: Support HVM_PARAM_CALLBACK_TYPE_PCI_INTX callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The guest is permitted to specify an arbitrary domain/bus/device/function
and INTX pin from which the callback IRQ shall appear to have come.

In QEMU we can only easily do this for devices that actually exist, and
even that requires us "knowing" that it's a PCMachine in order to find
the PCI root bus — although that's OK really because it's always true.

We also don't get to get notified of INTX routing changes, because we
can't do that as a passive observer; if we try to register a notifier
it will overwrite any existing notifier callback on the device.

But in practice, guests using PCI_INTX will only ever use pin A on the
Xen platform device, and won't swizzle the INTX routing after they set
it up. So this is just fine.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 80 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 66 insertions(+), 14 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 6b0bdba..f39b751 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -28,6 +28,8 @@
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
 #include "hw/i386/x86.h"
+#include "hw/i386/pc.h"
+#include "hw/pci/pci.h"
 #include "hw/irq.h"
 
 #include "xen_evtchn.h"
@@ -101,6 +103,7 @@ struct XenEvtchnState {
 
     uint64_t callback_param;
     bool evtchn_in_kernel;
+    uint32_t callback_gsi;
 
     QEMUBH *gsi_bh;
 
@@ -217,11 +220,41 @@ static void xen_evtchn_register_types(void)
 
 type_init(xen_evtchn_register_types)
 
+static int set_callback_pci_intx(XenEvtchnState *s, uint64_t param)
+{
+    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+    uint8_t pin = param & 3;
+    uint8_t devfn = (param >> 8) & 0xff;
+    uint16_t bus = (param >> 16) & 0xffff;
+    uint16_t domain = (param >> 32) & 0xffff;
+    PCIDevice *pdev;
+    PCIINTxRoute r;
+
+    if (domain || !pcms) {
+        return 0;
+    }
+
+    pdev = pci_find_device(pcms->bus, bus, devfn);
+    if (!pdev) {
+        return 0;
+    }
+
+    r = pci_device_route_intx_to_irq(pdev, pin);
+    if (r.mode != PCI_INTX_ENABLED) {
+        return 0;
+    }
+
+    /*
+     * Hm, can we be notified of INTX routing changes? Not without
+     * *owning* the device and being allowed to overwrite its own
+     * ->intx_routing_notifier, AFAICT. So let's not.
+     */
+    return r.irq;
+}
+
 void xen_evtchn_set_callback_level(int level)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
-    uint32_t param;
-
     if (!s) {
         return;
     }
@@ -260,18 +293,12 @@ void xen_evtchn_set_callback_level(int level)
         return;
     }
 
-    param = (uint32_t)s->callback_param;
-
-    switch (s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) {
-    case HVM_PARAM_CALLBACK_TYPE_GSI:
-        if (param < IOAPIC_NUM_PINS) {
-            qemu_set_irq(s->gsis[param], level);
-            if (level) {
-                /* Ensure the vCPU polls for deassertion */
-                kvm_xen_set_callback_asserted();
-            }
+    if (s->callback_gsi && s->callback_gsi < IOAPIC_NUM_PINS) {
+        qemu_set_irq(s->gsis[s->callback_gsi], level);
+        if (level) {
+            /* Ensure the vCPU polls for deassertion */
+            kvm_xen_set_callback_asserted();
         }
-        break;
     }
 }
 
@@ -283,15 +310,22 @@ int xen_evtchn_set_callback_param(uint64_t param)
         .u.vector = 0,
     };
     bool in_kernel = false;
+    uint32_t gsi = 0;
+    int type = param >> CALLBACK_VIA_TYPE_SHIFT;
     int ret;
 
     if (!s) {
         return -ENOTSUP;
     }
 
+    /*
+     * We need the BQL because set_callback_pci_intx() may call into PCI code,
+     * and because we may need to manipulate the old and new GSI levels.
+     */
+    assert(qemu_mutex_iothread_locked());
     qemu_mutex_lock(&s->port_lock);
 
-    switch (param >> CALLBACK_VIA_TYPE_SHIFT) {
+    switch (type) {
     case HVM_PARAM_CALLBACK_TYPE_VECTOR: {
         xa.u.vector = (uint8_t)param,
 
@@ -299,10 +333,17 @@ int xen_evtchn_set_callback_param(uint64_t param)
         if (!ret && kvm_xen_has_cap(EVTCHN_SEND)) {
             in_kernel = true;
         }
+        gsi = 0;
         break;
     }
 
+    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
+        gsi = set_callback_pci_intx(s, param);
+        ret = gsi ? 0 : -EINVAL;
+        break;
+
     case HVM_PARAM_CALLBACK_TYPE_GSI:
+        gsi = (uint32_t)param;
         ret = 0;
         break;
 
@@ -320,6 +361,17 @@ int xen_evtchn_set_callback_param(uint64_t param)
         }
         s->callback_param = param;
         s->evtchn_in_kernel = in_kernel;
+
+        if (gsi != s->callback_gsi) {
+            struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
+
+            xen_evtchn_set_callback_level(0);
+            s->callback_gsi = gsi;
+
+            if (gsi && vi && vi->evtchn_upcall_pending) {
+                kvm_xen_inject_vcpu_callback_vector(0, type);
+            }
+        }
     }
 
     qemu_mutex_unlock(&s->port_lock);
-- 
cgit v1.1


From a28b0fc0345b1ddc12109de807e6d4ce566c1914 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 16 Dec 2022 15:50:26 +0000
Subject: hw/xen: Add xen_gnttab device for grant table emulation

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/meson.build  |   1 +
 hw/i386/kvm/xen_gnttab.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_gnttab.h |  18 ++++++++
 hw/i386/pc.c             |   2 +
 4 files changed, 132 insertions(+)
 create mode 100644 hw/i386/kvm/xen_gnttab.c
 create mode 100644 hw/i386/kvm/xen_gnttab.h

(limited to 'hw')

diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 325c901..067ff52 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -7,6 +7,7 @@ i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c'))
 i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
   'xen_overlay.c',
   'xen_evtchn.c',
+  'xen_gnttab.c',
   ))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
new file mode 100644
index 0000000..ef8857e
--- /dev/null
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -0,0 +1,111 @@
+/*
+ * QEMU Xen emulation: Grant table support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/lockable.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "exec/target_page.h"
+#include "exec/address-spaces.h"
+#include "migration/vmstate.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "xen_overlay.h"
+#include "xen_gnttab.h"
+
+#include "sysemu/kvm.h"
+#include "sysemu/kvm_xen.h"
+
+#include "hw/xen/interface/memory.h"
+#include "hw/xen/interface/grant_table.h"
+
+#define TYPE_XEN_GNTTAB "xen-gnttab"
+OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB)
+
+#define XEN_PAGE_SHIFT 12
+#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
+
+struct XenGnttabState {
+    /*< private >*/
+    SysBusDevice busdev;
+    /*< public >*/
+
+    uint32_t nr_frames;
+    uint32_t max_frames;
+};
+
+struct XenGnttabState *xen_gnttab_singleton;
+
+static void xen_gnttab_realize(DeviceState *dev, Error **errp)
+{
+    XenGnttabState *s = XEN_GNTTAB(dev);
+
+    if (xen_mode != XEN_EMULATE) {
+        error_setg(errp, "Xen grant table support is for Xen emulation");
+        return;
+    }
+    s->nr_frames = 0;
+    s->max_frames = kvm_xen_get_gnttab_max_frames();
+}
+
+static bool xen_gnttab_is_needed(void *opaque)
+{
+    return xen_mode == XEN_EMULATE;
+}
+
+static const VMStateDescription xen_gnttab_vmstate = {
+    .name = "xen_gnttab",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = xen_gnttab_is_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(nr_frames, XenGnttabState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void xen_gnttab_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = xen_gnttab_realize;
+    dc->vmsd = &xen_gnttab_vmstate;
+}
+
+static const TypeInfo xen_gnttab_info = {
+    .name          = TYPE_XEN_GNTTAB,
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(XenGnttabState),
+    .class_init    = xen_gnttab_class_init,
+};
+
+void xen_gnttab_create(void)
+{
+    xen_gnttab_singleton = XEN_GNTTAB(sysbus_create_simple(TYPE_XEN_GNTTAB,
+                                                           -1, NULL));
+}
+
+static void xen_gnttab_register_types(void)
+{
+    type_register_static(&xen_gnttab_info);
+}
+
+type_init(xen_gnttab_register_types)
+
+int xen_gnttab_map_page(uint64_t idx, uint64_t gfn)
+{
+    return -ENOSYS;
+}
+
diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h
new file mode 100644
index 0000000..a7caa94
--- /dev/null
+++ b/hw/i386/kvm/xen_gnttab.h
@@ -0,0 +1,18 @@
+/*
+ * QEMU Xen emulation: Grant table support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_XEN_GNTTAB_H
+#define QEMU_XEN_GNTTAB_H
+
+void xen_gnttab_create(void);
+int xen_gnttab_map_page(uint64_t idx, uint64_t gfn);
+
+#endif /* QEMU_XEN_GNTTAB_H */
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 71d3b8b..1cf112e 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -92,6 +92,7 @@
 #include "hw/virtio/virtio-mem-pci.h"
 #include "hw/i386/kvm/xen_overlay.h"
 #include "hw/i386/kvm/xen_evtchn.h"
+#include "hw/i386/kvm/xen_gnttab.h"
 #include "hw/mem/memory-device.h"
 #include "sysemu/replay.h"
 #include "target/i386/cpu.h"
@@ -1860,6 +1861,7 @@ int pc_machine_kvm_type(MachineState *machine, const char *kvm_type)
     if (xen_mode == XEN_EMULATE) {
         xen_overlay_create();
         xen_evtchn_create();
+        xen_gnttab_create();
     }
 #endif
     return 0;
-- 
cgit v1.1


From e33cb789afd3aedbe181b78fd75d04a5c48f7cba Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 16 Dec 2022 18:33:49 +0000
Subject: hw/xen: Support mapping grant frames

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_gnttab.c  | 73 ++++++++++++++++++++++++++++++++++++++++++++++-
 hw/i386/kvm/xen_overlay.c |  2 +-
 hw/i386/kvm/xen_overlay.h |  2 ++
 3 files changed, 75 insertions(+), 2 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index ef8857e..72e87ae 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -37,13 +37,26 @@ OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB)
 #define XEN_PAGE_SHIFT 12
 #define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
 
+#define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
+
 struct XenGnttabState {
     /*< private >*/
     SysBusDevice busdev;
     /*< public >*/
 
+    QemuMutex gnt_lock;
+
     uint32_t nr_frames;
     uint32_t max_frames;
+
+    union {
+        grant_entry_v1_t *v1;
+        /* Theoretically, v2 support could be added here. */
+    } entries;
+
+    MemoryRegion gnt_frames;
+    MemoryRegion *gnt_aliases;
+    uint64_t *gnt_frame_gpas;
 };
 
 struct XenGnttabState *xen_gnttab_singleton;
@@ -51,6 +64,7 @@ struct XenGnttabState *xen_gnttab_singleton;
 static void xen_gnttab_realize(DeviceState *dev, Error **errp)
 {
     XenGnttabState *s = XEN_GNTTAB(dev);
+    int i;
 
     if (xen_mode != XEN_EMULATE) {
         error_setg(errp, "Xen grant table support is for Xen emulation");
@@ -58,6 +72,38 @@ static void xen_gnttab_realize(DeviceState *dev, Error **errp)
     }
     s->nr_frames = 0;
     s->max_frames = kvm_xen_get_gnttab_max_frames();
+    memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table",
+                           XEN_PAGE_SIZE * s->max_frames, &error_abort);
+    memory_region_set_enabled(&s->gnt_frames, true);
+    s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames);
+    memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
+
+    /* Create individual page-sizes aliases for overlays */
+    s->gnt_aliases = (void *)g_new0(MemoryRegion, s->max_frames);
+    s->gnt_frame_gpas = (void *)g_new(uint64_t, s->max_frames);
+    for (i = 0; i < s->max_frames; i++) {
+        memory_region_init_alias(&s->gnt_aliases[i], OBJECT(dev),
+                                 NULL, &s->gnt_frames,
+                                 i * XEN_PAGE_SIZE, XEN_PAGE_SIZE);
+        s->gnt_frame_gpas[i] = INVALID_GPA;
+    }
+
+    qemu_mutex_init(&s->gnt_lock);
+
+    xen_gnttab_singleton = s;
+}
+
+static int xen_gnttab_post_load(void *opaque, int version_id)
+{
+    XenGnttabState *s = XEN_GNTTAB(opaque);
+    uint32_t i;
+
+    for (i = 0; i < s->nr_frames; i++) {
+        if (s->gnt_frame_gpas[i] != INVALID_GPA) {
+            xen_overlay_do_map_page(&s->gnt_aliases[i], s->gnt_frame_gpas[i]);
+        }
+    }
+    return 0;
 }
 
 static bool xen_gnttab_is_needed(void *opaque)
@@ -70,8 +116,11 @@ static const VMStateDescription xen_gnttab_vmstate = {
     .version_id = 1,
     .minimum_version_id = 1,
     .needed = xen_gnttab_is_needed,
+    .post_load = xen_gnttab_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_UINT32(nr_frames, XenGnttabState),
+        VMSTATE_VARRAY_UINT32(gnt_frame_gpas, XenGnttabState, nr_frames, 0,
+                              vmstate_info_uint64, uint64_t),
         VMSTATE_END_OF_LIST()
     }
 };
@@ -106,6 +155,28 @@ type_init(xen_gnttab_register_types)
 
 int xen_gnttab_map_page(uint64_t idx, uint64_t gfn)
 {
-    return -ENOSYS;
+    XenGnttabState *s = xen_gnttab_singleton;
+    uint64_t gpa = gfn << XEN_PAGE_SHIFT;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (idx >= s->max_frames) {
+        return -EINVAL;
+    }
+
+    QEMU_IOTHREAD_LOCK_GUARD();
+    QEMU_LOCK_GUARD(&s->gnt_lock);
+
+    xen_overlay_do_map_page(&s->gnt_aliases[idx], gpa);
+
+    s->gnt_frame_gpas[idx] = gpa;
+
+    if (s->nr_frames <= idx) {
+        s->nr_frames = idx + 1;
+    }
+
+    return 0;
 }
 
diff --git a/hw/i386/kvm/xen_overlay.c b/hw/i386/kvm/xen_overlay.c
index 8685d87..39fda1b 100644
--- a/hw/i386/kvm/xen_overlay.c
+++ b/hw/i386/kvm/xen_overlay.c
@@ -49,7 +49,7 @@ struct XenOverlayState {
 
 struct XenOverlayState *xen_overlay_singleton;
 
-static void xen_overlay_do_map_page(MemoryRegion *page, uint64_t gpa)
+void xen_overlay_do_map_page(MemoryRegion *page, uint64_t gpa)
 {
     /*
      * Xen allows guests to map the same page as many times as it likes
diff --git a/hw/i386/kvm/xen_overlay.h b/hw/i386/kvm/xen_overlay.h
index 5c46a0b..75ecb6b 100644
--- a/hw/i386/kvm/xen_overlay.h
+++ b/hw/i386/kvm/xen_overlay.h
@@ -21,4 +21,6 @@ int xen_sync_long_mode(void);
 int xen_set_long_mode(bool long_mode);
 bool xen_is_long_mode(void);
 
+void xen_overlay_do_map_page(MemoryRegion *page, uint64_t gpa);
+
 #endif /* QEMU_XEN_OVERLAY_H */
-- 
cgit v1.1


From 28b7ae94a21d37f38f7b68a9f40fa521144898a6 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 16 Dec 2022 23:40:57 +0000
Subject: i386/xen: Implement HYPERVISOR_grant_table_op and
 GNTTABOP_[gs]et_verson

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_gnttab.c | 31 +++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_gnttab.h |  5 +++++
 2 files changed, 36 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index 72e87ae..b54a94e 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -180,3 +180,34 @@ int xen_gnttab_map_page(uint64_t idx, uint64_t gfn)
     return 0;
 }
 
+int xen_gnttab_set_version_op(struct gnttab_set_version *set)
+{
+    int ret;
+
+    switch (set->version) {
+    case 1:
+        ret = 0;
+        break;
+
+    case 2:
+        /* Behave as before set_version was introduced. */
+        ret = -ENOSYS;
+        break;
+
+    default:
+        ret = -EINVAL;
+    }
+
+    set->version = 1;
+    return ret;
+}
+
+int xen_gnttab_get_version_op(struct gnttab_get_version *get)
+{
+    if (get->dom != DOMID_SELF && get->dom != xen_domid) {
+        return -ESRCH;
+    }
+
+    get->version = 1;
+    return 0;
+}
diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h
index a7caa94..7957967 100644
--- a/hw/i386/kvm/xen_gnttab.h
+++ b/hw/i386/kvm/xen_gnttab.h
@@ -15,4 +15,9 @@
 void xen_gnttab_create(void);
 int xen_gnttab_map_page(uint64_t idx, uint64_t gfn);
 
+struct gnttab_set_version;
+struct gnttab_get_version;
+int xen_gnttab_set_version_op(struct gnttab_set_version *set);
+int xen_gnttab_get_version_op(struct gnttab_get_version *get);
+
 #endif /* QEMU_XEN_GNTTAB_H */
-- 
cgit v1.1


From b46f9745b1b5c8dd6ea1bd1361531f966c404f8c Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 16 Dec 2022 23:49:48 +0000
Subject: hw/xen: Implement GNTTABOP_query_size

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_gnttab.c | 19 +++++++++++++++++++
 hw/i386/kvm/xen_gnttab.h |  2 ++
 2 files changed, 21 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index b54a94e..1e691de 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -211,3 +211,22 @@ int xen_gnttab_get_version_op(struct gnttab_get_version *get)
     get->version = 1;
     return 0;
 }
+
+int xen_gnttab_query_size_op(struct gnttab_query_size *size)
+{
+    XenGnttabState *s = xen_gnttab_singleton;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (size->dom != DOMID_SELF && size->dom != xen_domid) {
+        size->status = GNTST_bad_domain;
+        return 0;
+    }
+
+    size->status = GNTST_okay;
+    size->nr_frames = s->nr_frames;
+    size->max_nr_frames = s->max_frames;
+    return 0;
+}
diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h
index 7957967..3bdbe96 100644
--- a/hw/i386/kvm/xen_gnttab.h
+++ b/hw/i386/kvm/xen_gnttab.h
@@ -17,7 +17,9 @@ int xen_gnttab_map_page(uint64_t idx, uint64_t gfn);
 
 struct gnttab_set_version;
 struct gnttab_get_version;
+struct gnttab_query_size;
 int xen_gnttab_set_version_op(struct gnttab_set_version *set);
 int xen_gnttab_get_version_op(struct gnttab_get_version *get);
+int xen_gnttab_query_size_op(struct gnttab_query_size *size);
 
 #endif /* QEMU_XEN_GNTTAB_H */
-- 
cgit v1.1


From b746a77926f6e84bdb35a38a9ee956ac12693757 Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Mon, 17 Sep 2018 07:04:54 -0400
Subject: i386/xen: handle PV timer hypercalls

Introduce support for one shot and periodic mode of Xen PV timers,
whereby timer interrupts come through a special virq event channel
with deadlines being set through:

1) set_timer_op hypercall (only oneshot)
2) vcpu_op hypercall for {set,stop}_{singleshot,periodic}_timer
hypercalls

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 31 +++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |  2 ++
 2 files changed, 33 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index f39b751..71ed480 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -1222,6 +1222,37 @@ int xen_evtchn_send_op(struct evtchn_send *send)
     return ret;
 }
 
+int xen_evtchn_set_port(uint16_t port)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    XenEvtchnPort *p;
+    int ret = -EINVAL;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (!valid_port(port)) {
+        return -EINVAL;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    p = &s->port_table[port];
+
+    /* QEMU has no business sending to anything but these */
+    if (p->type == EVTCHNSTAT_virq ||
+        (p->type == EVTCHNSTAT_interdomain &&
+         (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU))) {
+        set_port_pending(s, port);
+        ret = 0;
+    }
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
+
 EvtchnInfoList *qmp_xen_event_list(Error **errp)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index b03c310..2461147 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -20,6 +20,8 @@ int xen_evtchn_set_callback_param(uint64_t param);
 void xen_evtchn_connect_gsis(qemu_irq *system_gsis);
 void xen_evtchn_set_callback_level(int level);
 
+int xen_evtchn_set_port(uint16_t port);
+
 struct evtchn_status;
 struct evtchn_close;
 struct evtchn_unmask;
-- 
cgit v1.1


From 794fba23a53ac714589f84f868202d0cfcb41cd2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Tue, 27 Dec 2022 21:20:07 +0000
Subject: hw/xen: Add backend implementation of interdomain event channel
 support

The provides the QEMU side of interdomain event channels, allowing events
to be sent to/from the guest.

The API mirrors libxenevtchn, and in time both this and the real Xen one
will be available through ops structures so that the PV backend drivers
can use the correct one as appropriate.

For now, this implementation can be used directly by our XenStore which
will be for emulated mode only.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 340 ++++++++++++++++++++++++++++++++++++++++++++++-
 hw/i386/kvm/xen_evtchn.h |  19 +++
 2 files changed, 352 insertions(+), 7 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 71ed480..601fbee 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -38,6 +38,7 @@
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_xen.h"
 #include <linux/kvm.h>
+#include <sys/eventfd.h>
 
 #include "hw/xen/interface/memory.h"
 #include "hw/xen/interface/hvm/params.h"
@@ -88,6 +89,13 @@ struct compat_shared_info {
 
 #define COMPAT_EVTCHN_2L_NR_CHANNELS            1024
 
+/* Local private implementation of struct xenevtchn_handle */
+struct xenevtchn_handle {
+    evtchn_port_t be_port;
+    evtchn_port_t guest_port; /* Or zero for unbound */
+    int fd;
+};
+
 /*
  * For unbound/interdomain ports there are only two possible remote
  * domains; self and QEMU. Use a single high bit in type_val for that,
@@ -111,6 +119,8 @@ struct XenEvtchnState {
     uint32_t nr_ports;
     XenEvtchnPort port_table[EVTCHN_2L_NR_CHANNELS];
     qemu_irq gsis[IOAPIC_NUM_PINS];
+
+    struct xenevtchn_handle *be_handles[EVTCHN_2L_NR_CHANNELS];
 };
 
 struct XenEvtchnState *xen_evtchn_singleton;
@@ -118,6 +128,18 @@ struct XenEvtchnState *xen_evtchn_singleton;
 /* Top bits of callback_param are the type (HVM_PARAM_CALLBACK_TYPE_xxx) */
 #define CALLBACK_VIA_TYPE_SHIFT 56
 
+static void unbind_backend_ports(XenEvtchnState *s);
+
+static int xen_evtchn_pre_load(void *opaque)
+{
+    XenEvtchnState *s = opaque;
+
+    /* Unbind all the backend-side ports; they need to rebind */
+    unbind_backend_ports(s);
+
+    return 0;
+}
+
 static int xen_evtchn_post_load(void *opaque, int version_id)
 {
     XenEvtchnState *s = opaque;
@@ -151,6 +173,7 @@ static const VMStateDescription xen_evtchn_vmstate = {
     .version_id = 1,
     .minimum_version_id = 1,
     .needed = xen_evtchn_is_needed,
+    .pre_load = xen_evtchn_pre_load,
     .post_load = xen_evtchn_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(callback_param, XenEvtchnState),
@@ -423,6 +446,20 @@ static int assign_kernel_port(uint16_t type, evtchn_port_t port,
     return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &ha);
 }
 
+static int assign_kernel_eventfd(uint16_t type, evtchn_port_t port, int fd)
+{
+    struct kvm_xen_hvm_attr ha;
+
+    ha.type = KVM_XEN_ATTR_TYPE_EVTCHN;
+    ha.u.evtchn.send_port = port;
+    ha.u.evtchn.type = type;
+    ha.u.evtchn.flags = 0;
+    ha.u.evtchn.deliver.eventfd.port = 0;
+    ha.u.evtchn.deliver.eventfd.fd = fd;
+
+    return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &ha);
+}
+
 static bool valid_port(evtchn_port_t port)
 {
     if (!port) {
@@ -441,6 +478,32 @@ static bool valid_vcpu(uint32_t vcpu)
     return !!qemu_get_cpu(vcpu);
 }
 
+static void unbind_backend_ports(XenEvtchnState *s)
+{
+    XenEvtchnPort *p;
+    int i;
+
+    for (i = 1; i < s->nr_ports; i++) {
+        p = &s->port_table[i];
+        if (p->type == EVTCHNSTAT_interdomain &&
+            (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU)) {
+            evtchn_port_t be_port = p->type_val & PORT_INFO_TYPEVAL_REMOTE_PORT_MASK;
+
+            if (s->be_handles[be_port]) {
+                /* This part will be overwritten on the load anyway. */
+                p->type = EVTCHNSTAT_unbound;
+                p->type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+
+                /* Leave the backend port open and unbound too. */
+                if (kvm_xen_has_cap(EVTCHN_SEND)) {
+                    deassign_kernel_port(i);
+                }
+                s->be_handles[be_port]->guest_port = 0;
+            }
+        }
+    }
+}
+
 int xen_evtchn_status_op(struct evtchn_status *status)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
@@ -878,7 +941,14 @@ static int close_port(XenEvtchnState *s, evtchn_port_t port)
 
     case EVTCHNSTAT_interdomain:
         if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
-            /* Not yet implemented. This can't happen! */
+            uint16_t be_port = p->type_val & ~PORT_INFO_TYPEVAL_REMOTE_QEMU;
+            struct xenevtchn_handle *xc = s->be_handles[be_port];
+            if (xc) {
+                if (kvm_xen_has_cap(EVTCHN_SEND)) {
+                    deassign_kernel_port(port);
+                }
+                xc->guest_port = 0;
+            }
         } else {
             /* Loopback interdomain */
             XenEvtchnPort *rp = &s->port_table[p->type_val];
@@ -1110,8 +1180,27 @@ int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain)
     }
 
     if (interdomain->remote_dom == DOMID_QEMU) {
-        /* We haven't hooked up QEMU's PV drivers to this yet */
-        ret = -ENOSYS;
+        struct xenevtchn_handle *xc = s->be_handles[interdomain->remote_port];
+        XenEvtchnPort *lp = &s->port_table[interdomain->local_port];
+
+        if (!xc) {
+            ret = -ENOENT;
+            goto out_free_port;
+        }
+
+        if (xc->guest_port) {
+            ret = -EBUSY;
+            goto out_free_port;
+        }
+
+        assert(xc->be_port == interdomain->remote_port);
+        xc->guest_port = interdomain->local_port;
+        if (kvm_xen_has_cap(EVTCHN_SEND)) {
+            assign_kernel_eventfd(lp->type, xc->guest_port, xc->fd);
+        }
+        lp->type = EVTCHNSTAT_interdomain;
+        lp->type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU | interdomain->remote_port;
+        ret = 0;
     } else {
         /* Loopback */
         XenEvtchnPort *rp = &s->port_table[interdomain->remote_port];
@@ -1129,6 +1218,7 @@ int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain)
         }
     }
 
+ out_free_port:
     if (ret) {
         free_port(s, interdomain->local_port);
     }
@@ -1193,11 +1283,16 @@ int xen_evtchn_send_op(struct evtchn_send *send)
         if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
             /*
              * This is an event from the guest to qemu itself, which is
-             * serving as the driver domain. Not yet implemented; it will
-             * be hooked up to the qemu implementation of xenstore,
-             * console, PV net/block drivers etc.
+             * serving as the driver domain.
              */
-            ret = -ENOSYS;
+            uint16_t be_port = p->type_val & ~PORT_INFO_TYPEVAL_REMOTE_QEMU;
+            struct xenevtchn_handle *xc = s->be_handles[be_port];
+            if (xc) {
+                eventfd_write(xc->fd, 1);
+                ret = 0;
+            } else {
+                ret = -ENOENT;
+            }
         } else {
             /* Loopback interdomain ports; just a complex IPI */
             set_port_pending(s, p->type_val);
@@ -1253,6 +1348,237 @@ int xen_evtchn_set_port(uint16_t port)
     return ret;
 }
 
+struct xenevtchn_handle *xen_be_evtchn_open(void)
+{
+    struct xenevtchn_handle *xc = g_new0(struct xenevtchn_handle, 1);
+
+    xc->fd = eventfd(0, EFD_CLOEXEC);
+    if (xc->fd < 0) {
+        free(xc);
+        return NULL;
+    }
+
+    return xc;
+}
+
+static int find_be_port(XenEvtchnState *s, struct xenevtchn_handle *xc)
+{
+    int i;
+
+    for (i = 1; i < EVTCHN_2L_NR_CHANNELS; i++) {
+        if (!s->be_handles[i]) {
+            s->be_handles[i] = xc;
+            xc->be_port = i;
+            return i;
+        }
+    }
+    return 0;
+}
+
+int xen_be_evtchn_bind_interdomain(struct xenevtchn_handle *xc, uint32_t domid,
+                                   evtchn_port_t guest_port)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    XenEvtchnPort *gp;
+    uint16_t be_port = 0;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (!xc) {
+        return -EFAULT;
+    }
+
+    if (domid != xen_domid) {
+        return -ESRCH;
+    }
+
+    if (!valid_port(guest_port)) {
+        return -EINVAL;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    /* The guest has to have an unbound port waiting for us to bind */
+    gp = &s->port_table[guest_port];
+
+    switch (gp->type) {
+    case EVTCHNSTAT_interdomain:
+        /* Allow rebinding after migration, preserve port # if possible */
+        be_port = gp->type_val & ~PORT_INFO_TYPEVAL_REMOTE_QEMU;
+        assert(be_port != 0);
+        if (!s->be_handles[be_port]) {
+            s->be_handles[be_port] = xc;
+            xc->guest_port = guest_port;
+            ret = xc->be_port = be_port;
+            if (kvm_xen_has_cap(EVTCHN_SEND)) {
+                assign_kernel_eventfd(gp->type, guest_port, xc->fd);
+            }
+            break;
+        }
+        /* fall through */
+
+    case EVTCHNSTAT_unbound:
+        be_port = find_be_port(s, xc);
+        if (!be_port) {
+            ret = -ENOSPC;
+            goto out;
+        }
+
+        gp->type = EVTCHNSTAT_interdomain;
+        gp->type_val = be_port | PORT_INFO_TYPEVAL_REMOTE_QEMU;
+        xc->guest_port = guest_port;
+        if (kvm_xen_has_cap(EVTCHN_SEND)) {
+            assign_kernel_eventfd(gp->type, guest_port, xc->fd);
+        }
+        ret = be_port;
+        break;
+
+    default:
+        ret = -EINVAL;
+        break;
+    }
+
+ out:
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
+
+int xen_be_evtchn_unbind(struct xenevtchn_handle *xc, evtchn_port_t port)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (!xc) {
+        return -EFAULT;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    if (port && port != xc->be_port) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (xc->guest_port) {
+        XenEvtchnPort *gp = &s->port_table[xc->guest_port];
+
+        /* This should never *not* be true */
+        if (gp->type == EVTCHNSTAT_interdomain) {
+            gp->type = EVTCHNSTAT_unbound;
+            gp->type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+        }
+
+        if (kvm_xen_has_cap(EVTCHN_SEND)) {
+            deassign_kernel_port(xc->guest_port);
+        }
+        xc->guest_port = 0;
+    }
+
+    s->be_handles[xc->be_port] = NULL;
+    xc->be_port = 0;
+    ret = 0;
+ out:
+    qemu_mutex_unlock(&s->port_lock);
+    return ret;
+}
+
+int xen_be_evtchn_close(struct xenevtchn_handle *xc)
+{
+    if (!xc) {
+        return -EFAULT;
+    }
+
+    xen_be_evtchn_unbind(xc, 0);
+
+    close(xc->fd);
+    free(xc);
+    return 0;
+}
+
+int xen_be_evtchn_fd(struct xenevtchn_handle *xc)
+{
+    if (!xc) {
+        return -1;
+    }
+    return xc->fd;
+}
+
+int xen_be_evtchn_notify(struct xenevtchn_handle *xc, evtchn_port_t port)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (!xc) {
+        return -EFAULT;
+    }
+
+    qemu_mutex_lock(&s->port_lock);
+
+    if (xc->guest_port) {
+        set_port_pending(s, xc->guest_port);
+        ret = 0;
+    } else {
+        ret = -ENOTCONN;
+    }
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    return ret;
+}
+
+int xen_be_evtchn_pending(struct xenevtchn_handle *xc)
+{
+    uint64_t val;
+
+    if (!xc) {
+        return -EFAULT;
+    }
+
+    if (!xc->be_port) {
+        return 0;
+    }
+
+    if (eventfd_read(xc->fd, &val)) {
+        return -errno;
+    }
+
+    return val ? xc->be_port : 0;
+}
+
+int xen_be_evtchn_unmask(struct xenevtchn_handle *xc, evtchn_port_t port)
+{
+    if (!xc) {
+        return -EFAULT;
+    }
+
+    if (xc->be_port != port) {
+        return -EINVAL;
+    }
+
+    /*
+     * We don't actually do anything to unmask it; the event was already
+     * consumed in xen_be_evtchn_pending().
+     */
+    return 0;
+}
+
+int xen_be_evtchn_get_guest_port(struct xenevtchn_handle *xc)
+{
+    return xc->guest_port;
+}
+
 EvtchnInfoList *qmp_xen_event_list(Error **errp)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 2461147..5a71ffb 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -14,6 +14,8 @@
 
 #include "hw/sysbus.h"
 
+typedef uint32_t evtchn_port_t;
+
 void xen_evtchn_create(void);
 int xen_evtchn_soft_reset(void);
 int xen_evtchn_set_callback_param(uint64_t param);
@@ -22,6 +24,23 @@ void xen_evtchn_set_callback_level(int level);
 
 int xen_evtchn_set_port(uint16_t port);
 
+/*
+ * These functions mirror the libxenevtchn library API, providing the QEMU
+ * backend side of "interdomain" event channels.
+ */
+struct xenevtchn_handle;
+struct xenevtchn_handle *xen_be_evtchn_open(void);
+int xen_be_evtchn_bind_interdomain(struct xenevtchn_handle *xc, uint32_t domid,
+                                   evtchn_port_t guest_port);
+int xen_be_evtchn_unbind(struct xenevtchn_handle *xc, evtchn_port_t port);
+int xen_be_evtchn_close(struct xenevtchn_handle *xc);
+int xen_be_evtchn_fd(struct xenevtchn_handle *xc);
+int xen_be_evtchn_notify(struct xenevtchn_handle *xc, evtchn_port_t port);
+int xen_be_evtchn_unmask(struct xenevtchn_handle *xc, evtchn_port_t port);
+int xen_be_evtchn_pending(struct xenevtchn_handle *xc);
+/* Apart from this which is a local addition */
+int xen_be_evtchn_get_guest_port(struct xenevtchn_handle *xc);
+
 struct evtchn_status;
 struct evtchn_close;
 struct evtchn_unmask;
-- 
cgit v1.1


From c08f5d0e53b00f101c6aab7b5c7eabe22bab1962 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 23 Dec 2022 17:39:23 +0000
Subject: hw/xen: Add xen_xenstore device for xenstore emulation

Just the basic shell, with the event channel hookup. It only dumps the
buffer for now; a real ring implmentation will come in a subsequent patch.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/meson.build    |   1 +
 hw/i386/kvm/xen_evtchn.c   |   1 +
 hw/i386/kvm/xen_xenstore.c | 252 +++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_xenstore.h |  20 ++++
 hw/i386/pc.c               |   2 +
 5 files changed, 276 insertions(+)
 create mode 100644 hw/i386/kvm/xen_xenstore.c
 create mode 100644 hw/i386/kvm/xen_xenstore.h

(limited to 'hw')

diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 067ff52..82dd6ae 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -8,6 +8,7 @@ i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
   'xen_overlay.c',
   'xen_evtchn.c',
   'xen_gnttab.c',
+  'xen_xenstore.c',
   ))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 601fbee..6c39627 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -34,6 +34,7 @@
 
 #include "xen_evtchn.h"
 #include "xen_overlay.h"
+#include "xen_xenstore.h"
 
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_xen.h"
diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
new file mode 100644
index 0000000..e8abdda
--- /dev/null
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -0,0 +1,252 @@
+/*
+ * QEMU Xen emulation: Shared/overlay pages support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/main-loop.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "migration/vmstate.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "xen_overlay.h"
+#include "xen_evtchn.h"
+#include "xen_xenstore.h"
+
+#include "sysemu/kvm.h"
+#include "sysemu/kvm_xen.h"
+
+#include "hw/xen/interface/io/xs_wire.h"
+#include "hw/xen/interface/event_channel.h"
+
+#define TYPE_XEN_XENSTORE "xen-xenstore"
+OBJECT_DECLARE_SIMPLE_TYPE(XenXenstoreState, XEN_XENSTORE)
+
+#define XEN_PAGE_SHIFT 12
+#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
+
+#define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
+#define ENTRIES_PER_FRAME_V2 (XEN_PAGE_SIZE / sizeof(grant_entry_v2_t))
+
+#define XENSTORE_HEADER_SIZE ((unsigned int)sizeof(struct xsd_sockmsg))
+
+struct XenXenstoreState {
+    /*< private >*/
+    SysBusDevice busdev;
+    /*< public >*/
+
+    MemoryRegion xenstore_page;
+    struct xenstore_domain_interface *xs;
+    uint8_t req_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX];
+    uint8_t rsp_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX];
+    uint32_t req_offset;
+    uint32_t rsp_offset;
+    bool rsp_pending;
+    bool fatal_error;
+
+    evtchn_port_t guest_port;
+    evtchn_port_t be_port;
+    struct xenevtchn_handle *eh;
+};
+
+struct XenXenstoreState *xen_xenstore_singleton;
+
+static void xen_xenstore_event(void *opaque);
+
+static void xen_xenstore_realize(DeviceState *dev, Error **errp)
+{
+    XenXenstoreState *s = XEN_XENSTORE(dev);
+
+    if (xen_mode != XEN_EMULATE) {
+        error_setg(errp, "Xen xenstore support is for Xen emulation");
+        return;
+    }
+    memory_region_init_ram(&s->xenstore_page, OBJECT(dev), "xen:xenstore_page",
+                           XEN_PAGE_SIZE, &error_abort);
+    memory_region_set_enabled(&s->xenstore_page, true);
+    s->xs = memory_region_get_ram_ptr(&s->xenstore_page);
+    memset(s->xs, 0, XEN_PAGE_SIZE);
+
+    /* We can't map it this early as KVM isn't ready */
+    xen_xenstore_singleton = s;
+
+    s->eh = xen_be_evtchn_open();
+    if (!s->eh) {
+        error_setg(errp, "Xenstore evtchn port init failed");
+        return;
+    }
+    aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), true,
+                       xen_xenstore_event, NULL, NULL, NULL, s);
+}
+
+static bool xen_xenstore_is_needed(void *opaque)
+{
+    return xen_mode == XEN_EMULATE;
+}
+
+static int xen_xenstore_pre_save(void *opaque)
+{
+    XenXenstoreState *s = opaque;
+
+    if (s->eh) {
+        s->guest_port = xen_be_evtchn_get_guest_port(s->eh);
+    }
+    return 0;
+}
+
+static int xen_xenstore_post_load(void *opaque, int ver)
+{
+    XenXenstoreState *s = opaque;
+
+    /*
+     * As qemu/dom0, rebind to the guest's port. The Windows drivers may
+     * unbind the XenStore evtchn and rebind to it, having obtained the
+     * "remote" port through EVTCHNOP_status. In the case that migration
+     * occurs while it's unbound, the "remote" port needs to be the same
+     * as before so that the guest can find it, but should remain unbound.
+     */
+    if (s->guest_port) {
+        int be_port = xen_be_evtchn_bind_interdomain(s->eh, xen_domid,
+                                                     s->guest_port);
+        if (be_port < 0) {
+            return be_port;
+        }
+        s->be_port = be_port;
+    }
+    return 0;
+}
+
+static const VMStateDescription xen_xenstore_vmstate = {
+    .name = "xen_xenstore",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = xen_xenstore_is_needed,
+    .pre_save = xen_xenstore_pre_save,
+    .post_load = xen_xenstore_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8_ARRAY(req_data, XenXenstoreState,
+                            sizeof_field(XenXenstoreState, req_data)),
+        VMSTATE_UINT8_ARRAY(rsp_data, XenXenstoreState,
+                            sizeof_field(XenXenstoreState, rsp_data)),
+        VMSTATE_UINT32(req_offset, XenXenstoreState),
+        VMSTATE_UINT32(rsp_offset, XenXenstoreState),
+        VMSTATE_BOOL(rsp_pending, XenXenstoreState),
+        VMSTATE_UINT32(guest_port, XenXenstoreState),
+        VMSTATE_BOOL(fatal_error, XenXenstoreState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void xen_xenstore_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = xen_xenstore_realize;
+    dc->vmsd = &xen_xenstore_vmstate;
+}
+
+static const TypeInfo xen_xenstore_info = {
+    .name          = TYPE_XEN_XENSTORE,
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(XenXenstoreState),
+    .class_init    = xen_xenstore_class_init,
+};
+
+void xen_xenstore_create(void)
+{
+    DeviceState *dev = sysbus_create_simple(TYPE_XEN_XENSTORE, -1, NULL);
+
+    xen_xenstore_singleton = XEN_XENSTORE(dev);
+
+    /*
+     * Defer the init (xen_xenstore_reset()) until KVM is set up and the
+     * overlay page can be mapped.
+     */
+}
+
+static void xen_xenstore_register_types(void)
+{
+    type_register_static(&xen_xenstore_info);
+}
+
+type_init(xen_xenstore_register_types)
+
+uint16_t xen_xenstore_get_port(void)
+{
+    XenXenstoreState *s = xen_xenstore_singleton;
+    if (!s) {
+        return 0;
+    }
+    return s->guest_port;
+}
+
+static void xen_xenstore_event(void *opaque)
+{
+    XenXenstoreState *s = opaque;
+    evtchn_port_t port = xen_be_evtchn_pending(s->eh);
+    if (port != s->be_port) {
+        return;
+    }
+    printf("xenstore event\n");
+    /* We know this is a no-op. */
+    xen_be_evtchn_unmask(s->eh, port);
+    qemu_hexdump(stdout, "", s->xs, sizeof(*s->xs));
+    xen_be_evtchn_notify(s->eh, s->be_port);
+}
+
+static void alloc_guest_port(XenXenstoreState *s)
+{
+    struct evtchn_alloc_unbound alloc = {
+        .dom = DOMID_SELF,
+        .remote_dom = DOMID_QEMU,
+    };
+
+    if (!xen_evtchn_alloc_unbound_op(&alloc)) {
+        s->guest_port = alloc.port;
+    }
+}
+
+int xen_xenstore_reset(void)
+{
+    XenXenstoreState *s = xen_xenstore_singleton;
+    int err;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    s->req_offset = s->rsp_offset = 0;
+    s->rsp_pending = false;
+
+    if (!memory_region_is_mapped(&s->xenstore_page)) {
+        uint64_t gpa = XEN_SPECIAL_PFN(XENSTORE) << TARGET_PAGE_BITS;
+        xen_overlay_do_map_page(&s->xenstore_page, gpa);
+    }
+
+    alloc_guest_port(s);
+
+    /*
+     * As qemu/dom0, bind to the guest's port. For incoming migration, this
+     * will be unbound as the guest's evtchn table is overwritten. We then
+     * rebind to the correct guest port in xen_xenstore_post_load().
+     */
+    err = xen_be_evtchn_bind_interdomain(s->eh, xen_domid, s->guest_port);
+    if (err < 0) {
+        return err;
+    }
+    s->be_port = err;
+
+    return 0;
+}
diff --git a/hw/i386/kvm/xen_xenstore.h b/hw/i386/kvm/xen_xenstore.h
new file mode 100644
index 0000000..8c3768e
--- /dev/null
+++ b/hw/i386/kvm/xen_xenstore.h
@@ -0,0 +1,20 @@
+/*
+ * QEMU Xen emulation: Xenstore emulation
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_XEN_XENSTORE_H
+#define QEMU_XEN_XENSTORE_H
+
+void xen_xenstore_create(void);
+int xen_xenstore_reset(void);
+
+uint16_t xen_xenstore_get_port(void);
+
+#endif /* QEMU_XEN_XENSTORE_H */
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 1cf112e..f4a08cc 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -93,6 +93,7 @@
 #include "hw/i386/kvm/xen_overlay.h"
 #include "hw/i386/kvm/xen_evtchn.h"
 #include "hw/i386/kvm/xen_gnttab.h"
+#include "hw/i386/kvm/xen_xenstore.h"
 #include "hw/mem/memory-device.h"
 #include "sysemu/replay.h"
 #include "target/i386/cpu.h"
@@ -1862,6 +1863,7 @@ int pc_machine_kvm_type(MachineState *machine, const char *kvm_type)
         xen_overlay_create();
         xen_evtchn_create();
         xen_gnttab_create();
+        xen_xenstore_create();
     }
 #endif
     return 0;
-- 
cgit v1.1


From f3341e7b91548c38d484285307c23b8f9ce73307 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Wed, 28 Dec 2022 10:06:49 +0000
Subject: hw/xen: Add basic ring handling to xenstore

Extract requests, return ENOSYS to all of them. This is enough to allow
older Linux guests to boot, as they need *something* back but it doesn't
matter much what.

A full implementation of a single-tentant internal XenStore copy-on-write
tree with transactions and watches is waiting in the wings to be sent in
a subsequent round of patches along with hooking up the actual PV disk
back end in qemu, but this is enough to get guests booting for now.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_xenstore.c | 254 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 251 insertions(+), 3 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index e8abdda..14193ef 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -192,18 +192,266 @@ uint16_t xen_xenstore_get_port(void)
     return s->guest_port;
 }
 
+static bool req_pending(XenXenstoreState *s)
+{
+    struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data;
+
+    return s->req_offset == XENSTORE_HEADER_SIZE + req->len;
+}
+
+static void reset_req(XenXenstoreState *s)
+{
+    memset(s->req_data, 0, sizeof(s->req_data));
+    s->req_offset = 0;
+}
+
+static void reset_rsp(XenXenstoreState *s)
+{
+    s->rsp_pending = false;
+
+    memset(s->rsp_data, 0, sizeof(s->rsp_data));
+    s->rsp_offset = 0;
+}
+
+static void process_req(XenXenstoreState *s)
+{
+    struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data;
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    const char enosys[] = "ENOSYS";
+
+    assert(req_pending(s));
+    assert(!s->rsp_pending);
+
+    rsp->type = XS_ERROR;
+    rsp->req_id = req->req_id;
+    rsp->tx_id = req->tx_id;
+    rsp->len = sizeof(enosys);
+    memcpy((void *)&rsp[1], enosys, sizeof(enosys));
+
+    s->rsp_pending = true;
+    reset_req(s);
+}
+
+static unsigned int copy_from_ring(XenXenstoreState *s, uint8_t *ptr,
+                                   unsigned int len)
+{
+    if (!len) {
+        return 0;
+    }
+
+    XENSTORE_RING_IDX prod = qatomic_read(&s->xs->req_prod);
+    XENSTORE_RING_IDX cons = qatomic_read(&s->xs->req_cons);
+    unsigned int copied = 0;
+
+    /* Ensure the ring contents don't cross the req_prod access. */
+    smp_rmb();
+
+    while (len) {
+        unsigned int avail = prod - cons;
+        unsigned int offset = MASK_XENSTORE_IDX(cons);
+        unsigned int copylen = avail;
+
+        if (avail > XENSTORE_RING_SIZE) {
+            error_report("XenStore ring handling error");
+            s->fatal_error = true;
+            break;
+        } else if (avail == 0) {
+            break;
+        }
+
+        if (copylen > len) {
+            copylen = len;
+        }
+        if (copylen > XENSTORE_RING_SIZE - offset) {
+            copylen = XENSTORE_RING_SIZE - offset;
+        }
+
+        memcpy(ptr, &s->xs->req[offset], copylen);
+        copied += copylen;
+
+        ptr += copylen;
+        len -= copylen;
+
+        cons += copylen;
+    }
+
+    /*
+     * Not sure this ever mattered except on Alpha, but this barrier
+     * is to ensure that the update to req_cons is globally visible
+     * only after we have consumed all the data from the ring, and we
+     * don't end up seeing data written to the ring *after* the other
+     * end sees the update and writes more to the ring. Xen's own
+     * xenstored has the same barrier here (although with no comment
+     * at all, obviously, because it's Xen code).
+     */
+    smp_mb();
+
+    qatomic_set(&s->xs->req_cons, cons);
+
+    return copied;
+}
+
+static unsigned int copy_to_ring(XenXenstoreState *s, uint8_t *ptr,
+                                 unsigned int len)
+{
+    if (!len) {
+        return 0;
+    }
+
+    XENSTORE_RING_IDX cons = qatomic_read(&s->xs->rsp_cons);
+    XENSTORE_RING_IDX prod = qatomic_read(&s->xs->rsp_prod);
+    unsigned int copied = 0;
+
+    /*
+     * This matches the barrier in copy_to_ring() (or the guest's
+     * equivalent) betweem writing the data to the ring and updating
+     * rsp_prod. It protects against the pathological case (which
+     * again I think never happened except on Alpha) where our
+     * subsequent writes to the ring could *cross* the read of
+     * rsp_cons and the guest could see the new data when it was
+     * intending to read the old.
+     */
+    smp_mb();
+
+    while (len) {
+        unsigned int avail = cons + XENSTORE_RING_SIZE - prod;
+        unsigned int offset = MASK_XENSTORE_IDX(prod);
+        unsigned int copylen = len;
+
+        if (avail > XENSTORE_RING_SIZE) {
+            error_report("XenStore ring handling error");
+            s->fatal_error = true;
+            break;
+        } else if (avail == 0) {
+            break;
+        }
+
+        if (copylen > avail) {
+            copylen = avail;
+        }
+        if (copylen > XENSTORE_RING_SIZE - offset) {
+            copylen = XENSTORE_RING_SIZE - offset;
+        }
+
+
+        memcpy(&s->xs->rsp[offset], ptr, copylen);
+        copied += copylen;
+
+        ptr += copylen;
+        len -= copylen;
+
+        prod += copylen;
+    }
+
+    /* Ensure the ring contents are seen before rsp_prod update. */
+    smp_wmb();
+
+    qatomic_set(&s->xs->rsp_prod, prod);
+
+    return copied;
+}
+
+static unsigned int get_req(XenXenstoreState *s)
+{
+    unsigned int copied = 0;
+
+    if (s->fatal_error) {
+        return 0;
+    }
+
+    assert(!req_pending(s));
+
+    if (s->req_offset < XENSTORE_HEADER_SIZE) {
+        void *ptr = s->req_data + s->req_offset;
+        unsigned int len = XENSTORE_HEADER_SIZE;
+        unsigned int copylen = copy_from_ring(s, ptr, len);
+
+        copied += copylen;
+        s->req_offset += copylen;
+    }
+
+    if (s->req_offset >= XENSTORE_HEADER_SIZE) {
+        struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data;
+
+        if (req->len > (uint32_t)XENSTORE_PAYLOAD_MAX) {
+            error_report("Illegal XenStore request");
+            s->fatal_error = true;
+            return 0;
+        }
+
+        void *ptr = s->req_data + s->req_offset;
+        unsigned int len = XENSTORE_HEADER_SIZE + req->len - s->req_offset;
+        unsigned int copylen = copy_from_ring(s, ptr, len);
+
+        copied += copylen;
+        s->req_offset += copylen;
+    }
+
+    return copied;
+}
+
+static unsigned int put_rsp(XenXenstoreState *s)
+{
+    if (s->fatal_error) {
+        return 0;
+    }
+
+    assert(s->rsp_pending);
+
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    assert(s->rsp_offset < XENSTORE_HEADER_SIZE + rsp->len);
+
+    void *ptr = s->rsp_data + s->rsp_offset;
+    unsigned int len = XENSTORE_HEADER_SIZE + rsp->len - s->rsp_offset;
+    unsigned int copylen = copy_to_ring(s, ptr, len);
+
+    s->rsp_offset += copylen;
+
+    /* Have we produced a complete response? */
+    if (s->rsp_offset == XENSTORE_HEADER_SIZE + rsp->len) {
+        reset_rsp(s);
+    }
+
+    return copylen;
+}
+
 static void xen_xenstore_event(void *opaque)
 {
     XenXenstoreState *s = opaque;
     evtchn_port_t port = xen_be_evtchn_pending(s->eh);
+    unsigned int copied_to, copied_from;
+    bool processed, notify = false;
+
     if (port != s->be_port) {
         return;
     }
-    printf("xenstore event\n");
+
     /* We know this is a no-op. */
     xen_be_evtchn_unmask(s->eh, port);
-    qemu_hexdump(stdout, "", s->xs, sizeof(*s->xs));
-    xen_be_evtchn_notify(s->eh, s->be_port);
+
+    do {
+        copied_to = copied_from = 0;
+        processed = false;
+
+        if (s->rsp_pending) {
+            copied_to = put_rsp(s);
+        }
+
+        if (!req_pending(s)) {
+            copied_from = get_req(s);
+        }
+
+        if (req_pending(s) && !s->rsp_pending) {
+            process_req(s);
+            processed = true;
+        }
+
+        notify |= copied_to || copied_from;
+    } while (copied_to || copied_from || processed);
+
+    if (notify) {
+        xen_be_evtchn_notify(s->eh, s->be_port);
+    }
 }
 
 static void alloc_guest_port(XenXenstoreState *s)
-- 
cgit v1.1


From bdfdb74882b6cff4cc7f6f0f91cd3b34ee8e2216 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Tue, 17 Jan 2023 13:51:22 +0000
Subject: hw/xen: Automatically add xen-platform PCI device for emulated Xen
 guests

It isn't strictly mandatory but Linux guests at least will only map
their grant tables over the dummy BAR that it provides, and don't have
sufficient wit to map them in any other unused part of their guest
address space. So include it by default for minimal surprise factor.

As I come to document "how to run a Xen guest in QEMU", this means one
fewer thing to tell the user about, according to the mantra of "if it
needs documenting, fix it first, then document what remains".

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/pc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f4a08cc..fd17ce7 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1315,6 +1315,9 @@ void pc_basic_device_init(struct PCMachineState *pcms,
 #ifdef CONFIG_XEN_EMU
     if (xen_mode == XEN_EMULATE) {
         xen_evtchn_connect_gsis(gsi);
+        if (pcms->bus) {
+            pci_create_simple(pcms->bus, -1, "xen-platform");
+        }
     }
 #endif
 
-- 
cgit v1.1


From 799c23548f0d1a7a49b8c5bbc6b9b86e736296c0 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 13 Jan 2023 00:24:40 +0000
Subject: i386/xen: Implement HYPERVISOR_physdev_op

Just hook up the basic hypercalls to stubs in xen_evtchn.c for now.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 25 +++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h | 11 +++++++++++
 2 files changed, 36 insertions(+)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 6c39627..b95ee9b 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -1349,6 +1349,31 @@ int xen_evtchn_set_port(uint16_t port)
     return ret;
 }
 
+int xen_physdev_map_pirq(struct physdev_map_pirq *map)
+{
+    return -ENOTSUP;
+}
+
+int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap)
+{
+    return -ENOTSUP;
+}
+
+int xen_physdev_eoi_pirq(struct physdev_eoi *eoi)
+{
+    return -ENOTSUP;
+}
+
+int xen_physdev_query_pirq(struct physdev_irq_status_query *query)
+{
+    return -ENOTSUP;
+}
+
+int xen_physdev_get_free_pirq(struct physdev_get_free_pirq *get)
+{
+    return -ENOTSUP;
+}
+
 struct xenevtchn_handle *xen_be_evtchn_open(void)
 {
     struct xenevtchn_handle *xc = g_new0(struct xenevtchn_handle, 1);
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 5a71ffb..352c875 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -62,4 +62,15 @@ int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain);
 int xen_evtchn_bind_vcpu_op(struct evtchn_bind_vcpu *vcpu);
 int xen_evtchn_reset_op(struct evtchn_reset *reset);
 
+struct physdev_map_pirq;
+struct physdev_unmap_pirq;
+struct physdev_eoi;
+struct physdev_irq_status_query;
+struct physdev_get_free_pirq;
+int xen_physdev_map_pirq(struct physdev_map_pirq *map);
+int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap);
+int xen_physdev_eoi_pirq(struct physdev_eoi *eoi);
+int xen_physdev_query_pirq(struct physdev_irq_status_query *query);
+int xen_physdev_get_free_pirq(struct physdev_get_free_pirq *get);
+
 #endif /* QEMU_XEN_EVTCHN_H */
-- 
cgit v1.1


From aa98ee38a5e77d1e40987c694abd0c5bc5d6c811 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 13 Jan 2023 19:51:32 +0000
Subject: hw/xen: Implement emulated PIRQ hypercall support

This wires up the basic infrastructure but the actual interrupts aren't
there yet, so don't advertise it to the guest.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/trace-events |   4 +
 hw/i386/kvm/trace.h      |   1 +
 hw/i386/kvm/xen_evtchn.c | 306 ++++++++++++++++++++++++++++++++++++++++++++++-
 hw/i386/kvm/xen_evtchn.h |   2 +
 4 files changed, 308 insertions(+), 5 deletions(-)
 create mode 100644 hw/i386/kvm/trace-events
 create mode 100644 hw/i386/kvm/trace.h

(limited to 'hw')

diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events
new file mode 100644
index 0000000..04e60c5
--- /dev/null
+++ b/hw/i386/kvm/trace-events
@@ -0,0 +1,4 @@
+kvm_xen_map_pirq(int pirq, int gsi) "pirq %d gsi %d"
+kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d"
+kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d"
+kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d"
diff --git a/hw/i386/kvm/trace.h b/hw/i386/kvm/trace.h
new file mode 100644
index 0000000..e55d081
--- /dev/null
+++ b/hw/i386/kvm/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_i386_kvm.h"
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index b95ee9b..59eea63 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -24,6 +24,7 @@
 #include "exec/target_page.h"
 #include "exec/address-spaces.h"
 #include "migration/vmstate.h"
+#include "trace.h"
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
@@ -105,6 +106,21 @@ struct xenevtchn_handle {
 #define PORT_INFO_TYPEVAL_REMOTE_QEMU           0x8000
 #define PORT_INFO_TYPEVAL_REMOTE_PORT_MASK      0x7FFF
 
+/*
+ * These 'emuirq' values are used by Xen in the LM stream... and yes, I am
+ * insane enough to think about guest-transparent live migration from actual
+ * Xen to QEMU, and ensuring that we can convert/consume the stream.
+ */
+#define IRQ_UNBOUND -1
+#define IRQ_PT -2
+#define IRQ_MSI_EMU -3
+
+
+struct pirq_info {
+    int gsi;
+    uint16_t port;
+};
+
 struct XenEvtchnState {
     /*< private >*/
     SysBusDevice busdev;
@@ -122,8 +138,25 @@ struct XenEvtchnState {
     qemu_irq gsis[IOAPIC_NUM_PINS];
 
     struct xenevtchn_handle *be_handles[EVTCHN_2L_NR_CHANNELS];
+
+    uint32_t nr_pirqs;
+
+    /* Bitmap of allocated PIRQs (serialized) */
+    uint16_t nr_pirq_inuse_words;
+    uint64_t *pirq_inuse_bitmap;
+
+    /* GSI → PIRQ mapping (serialized) */
+    uint16_t gsi_pirq[IOAPIC_NUM_PINS];
+
+    /* Per-PIRQ information (rebuilt on migration) */
+    struct pirq_info *pirq;
 };
 
+#define pirq_inuse_word(s, pirq) (s->pirq_inuse_bitmap[((pirq) / 64)])
+#define pirq_inuse_bit(pirq) (1ULL << ((pirq) & 63))
+
+#define pirq_inuse(s, pirq) (pirq_inuse_word(s, pirq) & pirq_inuse_bit(pirq))
+
 struct XenEvtchnState *xen_evtchn_singleton;
 
 /* Top bits of callback_param are the type (HVM_PARAM_CALLBACK_TYPE_xxx) */
@@ -138,17 +171,45 @@ static int xen_evtchn_pre_load(void *opaque)
     /* Unbind all the backend-side ports; they need to rebind */
     unbind_backend_ports(s);
 
+    /* It'll be leaked otherwise. */
+    g_free(s->pirq_inuse_bitmap);
+    s->pirq_inuse_bitmap = NULL;
+
     return 0;
 }
 
 static int xen_evtchn_post_load(void *opaque, int version_id)
 {
     XenEvtchnState *s = opaque;
+    uint32_t i;
 
     if (s->callback_param) {
         xen_evtchn_set_callback_param(s->callback_param);
     }
 
+    /* Rebuild s->pirq[].port mapping */
+    for (i = 0; i < s->nr_ports; i++) {
+        XenEvtchnPort *p = &s->port_table[i];
+
+        if (p->type == EVTCHNSTAT_pirq) {
+            assert(p->type_val);
+            assert(p->type_val < s->nr_pirqs);
+
+            /*
+             * Set the gsi to IRQ_UNBOUND; it may be changed to an actual
+             * GSI# below, or to IRQ_MSI_EMU when the MSI table snooping
+             * catches up with it.
+             */
+            s->pirq[p->type_val].gsi = IRQ_UNBOUND;
+            s->pirq[p->type_val].port = i;
+        }
+    }
+    /* Rebuild s->pirq[].gsi mapping */
+    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+        if (s->gsi_pirq[i]) {
+            s->pirq[s->gsi_pirq[i]].gsi = i;
+        }
+    }
     return 0;
 }
 
@@ -181,6 +242,10 @@ static const VMStateDescription xen_evtchn_vmstate = {
         VMSTATE_UINT32(nr_ports, XenEvtchnState),
         VMSTATE_STRUCT_VARRAY_UINT32(port_table, XenEvtchnState, nr_ports, 1,
                                      xen_evtchn_port_vmstate, XenEvtchnPort),
+        VMSTATE_UINT16_ARRAY(gsi_pirq, XenEvtchnState, IOAPIC_NUM_PINS),
+        VMSTATE_VARRAY_UINT16_ALLOC(pirq_inuse_bitmap, XenEvtchnState,
+                                    nr_pirq_inuse_words, 0,
+                                    vmstate_info_uint64, uint64_t),
         VMSTATE_END_OF_LIST()
     }
 };
@@ -221,6 +286,23 @@ void xen_evtchn_create(void)
     for (i = 0; i < IOAPIC_NUM_PINS; i++) {
         sysbus_init_irq(SYS_BUS_DEVICE(s), &s->gsis[i]);
     }
+
+    /*
+     * We could parameterise the number of PIRQs available if needed,
+     * but for now limit it to 256. The Xen scheme for encoding PIRQ#
+     * into an MSI message is not compatible with 32-bit MSI, as it
+     * puts the high bits of the PIRQ# into the high bits of the MSI
+     * message address, instead of using the Extended Destination ID
+     * in address bits 4-11 which perhaps would have been a better
+     * choice. So to keep life simple, just stick with 256 as the
+     * default, which conveniently doesn't need to set anything
+     * outside the low 32 bits of the address.
+     */
+    s->nr_pirqs = 256;
+
+    s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64);
+    s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words);
+    s->pirq = g_new0(struct pirq_info, s->nr_pirqs);
 }
 
 void xen_evtchn_connect_gsis(qemu_irq *system_gsis)
@@ -929,6 +1011,10 @@ static int close_port(XenEvtchnState *s, evtchn_port_t port)
     case EVTCHNSTAT_closed:
         return -ENOENT;
 
+    case EVTCHNSTAT_pirq:
+        s->pirq[p->type_val].port = 0;
+        break;
+
     case EVTCHNSTAT_virq:
         kvm_xen_set_vcpu_virq(virq_is_global(p->type_val) ? 0 : p->vcpu,
                               p->type_val, 0);
@@ -1123,6 +1209,37 @@ int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq)
     return ret;
 }
 
+int xen_evtchn_bind_pirq_op(struct evtchn_bind_pirq *pirq)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int ret;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (pirq->pirq >= s->nr_pirqs) {
+        return -EINVAL;
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    if (s->pirq[pirq->pirq].port) {
+        return -EBUSY;
+    }
+
+    ret = allocate_port(s, 0, EVTCHNSTAT_pirq, pirq->pirq,
+                        &pirq->port);
+    if (ret) {
+        return ret;
+    }
+
+    s->pirq[pirq->pirq].port = pirq->port;
+    trace_kvm_xen_bind_pirq(pirq->pirq, pirq->port);
+
+    return ret;
+}
+
 int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
@@ -1349,29 +1466,208 @@ int xen_evtchn_set_port(uint16_t port)
     return ret;
 }
 
+static int allocate_pirq(XenEvtchnState *s, int type, int gsi)
+{
+    uint16_t pirq;
+
+    /*
+     * Preserve the allocation strategy that Xen has. It looks like
+     * we *never* give out PIRQ 0-15, we give out 16-nr_irqs_gsi only
+     * to GSIs (counting up from 16), and then we count backwards from
+     * the top for MSIs or when the GSI space is exhausted.
+     */
+    if (type == MAP_PIRQ_TYPE_GSI) {
+        for (pirq = 16 ; pirq < IOAPIC_NUM_PINS; pirq++) {
+            if (pirq_inuse(s, pirq)) {
+                continue;
+            }
+
+            /* Found it */
+            goto found;
+        }
+    }
+    for (pirq = s->nr_pirqs - 1; pirq >= IOAPIC_NUM_PINS; pirq--) {
+        /* Skip whole words at a time when they're full */
+        if (pirq_inuse_word(s, pirq) == UINT64_MAX) {
+            pirq &= ~63ULL;
+            continue;
+        }
+        if (pirq_inuse(s, pirq)) {
+            continue;
+        }
+
+        goto found;
+    }
+    return -ENOSPC;
+
+ found:
+    pirq_inuse_word(s, pirq) |= pirq_inuse_bit(pirq);
+    if (gsi >= 0) {
+        assert(gsi <= IOAPIC_NUM_PINS);
+        s->gsi_pirq[gsi] = pirq;
+    }
+    s->pirq[pirq].gsi = gsi;
+    return pirq;
+}
+
 int xen_physdev_map_pirq(struct physdev_map_pirq *map)
 {
-    return -ENOTSUP;
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int pirq = map->pirq;
+    int gsi = map->index;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    if (map->domid != DOMID_SELF && map->domid != xen_domid) {
+        return -EPERM;
+    }
+    if (map->type != MAP_PIRQ_TYPE_GSI) {
+        return -EINVAL;
+    }
+    if (gsi < 0 || gsi >= IOAPIC_NUM_PINS) {
+        return -EINVAL;
+    }
+
+    if (pirq < 0) {
+        pirq = allocate_pirq(s, map->type, gsi);
+        if (pirq < 0) {
+            return pirq;
+        }
+        map->pirq = pirq;
+    } else if (pirq > s->nr_pirqs) {
+        return -EINVAL;
+    } else {
+        /*
+         * User specified a valid-looking PIRQ#. Allow it if it is
+         * allocated and not yet bound, or if it is unallocated
+         */
+        if (pirq_inuse(s, pirq)) {
+            if (s->pirq[pirq].gsi != IRQ_UNBOUND) {
+                return -EBUSY;
+            }
+        } else {
+            /* If it was unused, mark it used now. */
+            pirq_inuse_word(s, pirq) |= pirq_inuse_bit(pirq);
+        }
+        /* Set the mapping in both directions. */
+        s->pirq[pirq].gsi = gsi;
+        s->gsi_pirq[gsi] = pirq;
+    }
+
+    trace_kvm_xen_map_pirq(pirq, gsi);
+    return 0;
 }
 
 int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap)
 {
-    return -ENOTSUP;
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int pirq = unmap->pirq;
+    int gsi;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (unmap->domid != DOMID_SELF && unmap->domid != xen_domid) {
+        return -EPERM;
+    }
+    if (pirq < 0 || pirq >= s->nr_pirqs) {
+        return -EINVAL;
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    if (!pirq_inuse(s, pirq)) {
+        return -ENOENT;
+    }
+
+    gsi = s->pirq[pirq].gsi;
+
+    /* We can only unmap GSI PIRQs */
+    if (gsi < 0) {
+        return -EINVAL;
+    }
+
+    s->gsi_pirq[gsi] = 0;
+    s->pirq[pirq].gsi = IRQ_UNBOUND; /* Doesn't actually matter because: */
+    pirq_inuse_word(s, pirq) &= ~pirq_inuse_bit(pirq);
+
+    trace_kvm_xen_unmap_pirq(pirq, gsi);
+    return 0;
 }
 
 int xen_physdev_eoi_pirq(struct physdev_eoi *eoi)
 {
-    return -ENOTSUP;
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int pirq = eoi->irq;
+    int gsi;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    if (!pirq_inuse(s, pirq)) {
+        return -ENOENT;
+    }
+
+    gsi = s->pirq[pirq].gsi;
+    if (gsi < 0) {
+        return -EINVAL;
+    }
+
+    /* XX: Reassert a level IRQ if needed */
+    return 0;
 }
 
 int xen_physdev_query_pirq(struct physdev_irq_status_query *query)
 {
-    return -ENOTSUP;
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int pirq = query->irq;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    if (!pirq_inuse(s, pirq)) {
+        return -ENOENT;
+    }
+
+    if (s->pirq[pirq].gsi >= 0) {
+        query->flags = XENIRQSTAT_needs_eoi;
+    } else {
+        query->flags = 0;
+    }
+
+    return 0;
 }
 
 int xen_physdev_get_free_pirq(struct physdev_get_free_pirq *get)
 {
-    return -ENOTSUP;
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int pirq;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    pirq = allocate_pirq(s, get->type, IRQ_UNBOUND);
+    if (pirq < 0) {
+        return pirq;
+    }
+
+    get->pirq = pirq;
+    trace_kvm_xen_get_free_pirq(pirq, get->type);
+    return 0;
 }
 
 struct xenevtchn_handle *xen_be_evtchn_open(void)
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 352c875..a7383f7 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -45,6 +45,7 @@ struct evtchn_status;
 struct evtchn_close;
 struct evtchn_unmask;
 struct evtchn_bind_virq;
+struct evtchn_bind_pirq;
 struct evtchn_bind_ipi;
 struct evtchn_send;
 struct evtchn_alloc_unbound;
@@ -55,6 +56,7 @@ int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
 int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
+int xen_evtchn_bind_pirq_op(struct evtchn_bind_pirq *pirq);
 int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
 int xen_evtchn_send_op(struct evtchn_send *send);
 int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
-- 
cgit v1.1


From 4f81baa33ed645fc17a9908236630b8154502ae5 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 13 Jan 2023 20:41:19 +0000
Subject: hw/xen: Support GSI mapping to PIRQ

If I advertise XENFEAT_hvm_pirqs then a guest now boots successfully as
long as I tell it 'pci=nomsi'.

[root@localhost ~]# cat /proc/interrupts
           CPU0
  0:         52   IO-APIC   2-edge      timer
  1:         16  xen-pirq   1-ioapic-edge  i8042
  4:       1534  xen-pirq   4-ioapic-edge  ttyS0
  8:          1  xen-pirq   8-ioapic-edge  rtc0
  9:          0  xen-pirq   9-ioapic-level  acpi
 11:       5648  xen-pirq  11-ioapic-level  ahci[0000:00:04.0]
 12:        257  xen-pirq  12-ioapic-edge  i8042
...

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++-
 hw/i386/kvm/xen_evtchn.h |  2 ++
 hw/i386/x86.c            | 16 +++++++++++++
 3 files changed, 75 insertions(+), 1 deletion(-)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 59eea63..f2c4b43 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -148,6 +148,9 @@ struct XenEvtchnState {
     /* GSI → PIRQ mapping (serialized) */
     uint16_t gsi_pirq[IOAPIC_NUM_PINS];
 
+    /* Per-GSI assertion state (serialized) */
+    uint32_t pirq_gsi_set;
+
     /* Per-PIRQ information (rebuilt on migration) */
     struct pirq_info *pirq;
 };
@@ -246,6 +249,7 @@ static const VMStateDescription xen_evtchn_vmstate = {
         VMSTATE_VARRAY_UINT16_ALLOC(pirq_inuse_bitmap, XenEvtchnState,
                                     nr_pirq_inuse_words, 0,
                                     vmstate_info_uint64, uint64_t),
+        VMSTATE_UINT32(pirq_gsi_set, XenEvtchnState),
         VMSTATE_END_OF_LIST()
     }
 };
@@ -1510,6 +1514,51 @@ static int allocate_pirq(XenEvtchnState *s, int type, int gsi)
     return pirq;
 }
 
+bool xen_evtchn_set_gsi(int gsi, int level)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int pirq;
+
+    assert(qemu_mutex_iothread_locked());
+
+    if (!s || gsi < 0 || gsi > IOAPIC_NUM_PINS) {
+        return false;
+    }
+
+    /*
+     * Check that that it *isn't* the event channel GSI, and thus
+     * that we are not recursing and it's safe to take s->port_lock.
+     *
+     * Locking aside, it's perfectly sane to bail out early for that
+     * special case, as it would make no sense for the event channel
+     * GSI to be routed back to event channels, when the delivery
+     * method is to raise the GSI... that recursion wouldn't *just*
+     * be a locking issue.
+     */
+    if (gsi && gsi == s->callback_gsi) {
+        return false;
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    pirq = s->gsi_pirq[gsi];
+    if (!pirq) {
+        return false;
+    }
+
+    if (level) {
+        int port = s->pirq[pirq].port;
+
+        s->pirq_gsi_set |= (1U << gsi);
+        if (port) {
+            set_port_pending(s, port);
+        }
+    } else {
+        s->pirq_gsi_set &= ~(1U << gsi);
+    }
+    return true;
+}
+
 int xen_physdev_map_pirq(struct physdev_map_pirq *map)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
@@ -1621,7 +1670,14 @@ int xen_physdev_eoi_pirq(struct physdev_eoi *eoi)
         return -EINVAL;
     }
 
-    /* XX: Reassert a level IRQ if needed */
+    /* Reassert a level IRQ if needed */
+    if (s->pirq_gsi_set & (1U << gsi)) {
+        int port = s->pirq[pirq].port;
+        if (port) {
+            set_port_pending(s, port);
+        }
+    }
+
     return 0;
 }
 
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index a7383f7..95400b7 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -24,6 +24,8 @@ void xen_evtchn_set_callback_level(int level);
 
 int xen_evtchn_set_port(uint16_t port);
 
+bool xen_evtchn_set_gsi(int gsi, int level);
+
 /*
  * These functions mirror the libxenevtchn library API, providing the QEMU
  * backend side of "interdomain" event channels.
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index c44846f..a56b10b 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -61,6 +61,11 @@
 #include CONFIG_DEVICES
 #include "kvm/kvm_i386.h"
 
+#ifdef CONFIG_XEN_EMU
+#include "hw/xen/xen.h"
+#include "hw/i386/kvm/xen_evtchn.h"
+#endif
+
 /* Physical Address of PVH entry point read from kernel ELF NOTE */
 static size_t pvh_start_addr;
 
@@ -610,6 +615,17 @@ void gsi_handler(void *opaque, int n, int level)
         }
         /* fall through */
     case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1:
+#ifdef CONFIG_XEN_EMU
+        /*
+         * Xen delivers the GSI to the Legacy PIC (not that Legacy PIC
+         * routing actually works properly under Xen). And then to
+         * *either* the PIRQ handling or the I/OAPIC depending on
+         * whether the former wants it.
+         */
+        if (xen_mode == XEN_EMULATE && xen_evtchn_set_gsi(n, level)) {
+            break;
+        }
+#endif
         qemu_set_irq(s->ioapic_irq[n], level);
         break;
     case IO_APIC_SECONDARY_IRQBASE
-- 
cgit v1.1


From 6096cf7877bc6ee84e6b3b44dfe144bc8b549724 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 13 Jan 2023 23:35:46 +0000
Subject: hw/xen: Support MSI mapping to PIRQ
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The way that Xen handles MSI PIRQs is kind of awful.

There is a special MSI message which targets a PIRQ. The vector in the
low bits of data must be zero. The low 8 bits of the PIRQ# are in the
destination ID field, the extended destination ID field is unused, and
instead the high bits of the PIRQ# are in the high 32 bits of the address.

Using the high bits of the address means that we can't intercept and
translate these messages in kvm_send_msi(), because they won't be caught
by the APIC — addresses like 0x1000fee46000 aren't in the APIC's range.

So we catch them in pci_msi_trigger() instead, and deliver the event
channel directly.

That isn't even the worst part. The worst part is that Xen snoops on
writes to devices' MSI vectors while they are *masked*. When a MSI
message is written which looks like it targets a PIRQ, it remembers
the device and vector for later.

When the guest makes a hypercall to bind that PIRQ# (snooped from a
marked MSI vector) to an event channel port, Xen *unmasks* that MSI
vector on the device. Xen guests using PIRQ delivery of MSI don't
ever actually unmask the MSI for themselves.

Now that this is working we can finally enable XENFEAT_hvm_pirqs and
let the guest use it all.

Tested with passthrough igb and emulated e1000e + AHCI.

           CPU0       CPU1
  0:         65          0   IO-APIC   2-edge      timer
  1:          0         14  xen-pirq   1-ioapic-edge  i8042
  4:          0        846  xen-pirq   4-ioapic-edge  ttyS0
  8:          1          0  xen-pirq   8-ioapic-edge  rtc0
  9:          0          0  xen-pirq   9-ioapic-level  acpi
 12:        257          0  xen-pirq  12-ioapic-edge  i8042
 24:       9600          0  xen-percpu    -virq      timer0
 25:       2758          0  xen-percpu    -ipi       resched0
 26:          0          0  xen-percpu    -ipi       callfunc0
 27:          0          0  xen-percpu    -virq      debug0
 28:       1526          0  xen-percpu    -ipi       callfuncsingle0
 29:          0          0  xen-percpu    -ipi       spinlock0
 30:          0       8608  xen-percpu    -virq      timer1
 31:          0        874  xen-percpu    -ipi       resched1
 32:          0          0  xen-percpu    -ipi       callfunc1
 33:          0          0  xen-percpu    -virq      debug1
 34:          0       1617  xen-percpu    -ipi       callfuncsingle1
 35:          0          0  xen-percpu    -ipi       spinlock1
 36:          8          0   xen-dyn    -event     xenbus
 37:          0       6046  xen-pirq    -msi       ahci[0000:00:03.0]
 38:          1          0  xen-pirq    -msi-x     ens4
 39:          0         73  xen-pirq    -msi-x     ens4-rx-0
 40:         14          0  xen-pirq    -msi-x     ens4-rx-1
 41:          0         32  xen-pirq    -msi-x     ens4-tx-0
 42:         47          0  xen-pirq    -msi-x     ens4-tx-1

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/trace-events |   1 +
 hw/i386/kvm/xen-stubs.c  |  16 +++
 hw/i386/kvm/xen_evtchn.c | 262 +++++++++++++++++++++++++++++++++++++++++++++--
 hw/i386/kvm/xen_evtchn.h |   8 ++
 hw/pci/msi.c             |  11 ++
 hw/pci/msix.c            |   9 ++
 hw/pci/pci.c             |  17 +++
 7 files changed, 317 insertions(+), 7 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events
index 04e60c5..b83c3eb 100644
--- a/hw/i386/kvm/trace-events
+++ b/hw/i386/kvm/trace-events
@@ -2,3 +2,4 @@ kvm_xen_map_pirq(int pirq, int gsi) "pirq %d gsi %d"
 kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d"
 kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d"
 kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d"
+kvm_xen_unmask_pirq(int pirq, char *dev, int vector) "pirq %d dev %s vector %d"
diff --git a/hw/i386/kvm/xen-stubs.c b/hw/i386/kvm/xen-stubs.c
index 720590a..ae406e0 100644
--- a/hw/i386/kvm/xen-stubs.c
+++ b/hw/i386/kvm/xen-stubs.c
@@ -14,6 +14,22 @@
 #include "qapi/error.h"
 #include "qapi/qapi-commands-misc-target.h"
 
+#include "xen_evtchn.h"
+
+void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector,
+                          uint64_t addr, uint32_t data, bool is_masked)
+{
+}
+
+void xen_evtchn_remove_pci_device(PCIDevice *dev)
+{
+}
+
+bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data)
+{
+    return false;
+}
+
 #ifdef TARGET_I386
 EvtchnInfoList *qmp_xen_event_list(Error **errp)
 {
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index f2c4b43..69c0204 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -31,6 +31,8 @@
 #include "hw/i386/x86.h"
 #include "hw/i386/pc.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
 #include "hw/irq.h"
 
 #include "xen_evtchn.h"
@@ -45,6 +47,9 @@
 #include "hw/xen/interface/memory.h"
 #include "hw/xen/interface/hvm/params.h"
 
+/* XX: For kvm_update_msi_routes_all() */
+#include "target/i386/kvm/kvm_i386.h"
+
 #define TYPE_XEN_EVTCHN "xen-evtchn"
 OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN)
 
@@ -119,6 +124,11 @@ struct xenevtchn_handle {
 struct pirq_info {
     int gsi;
     uint16_t port;
+    PCIDevice *dev;
+    int vector;
+    bool is_msix;
+    bool is_masked;
+    bool is_translated;
 };
 
 struct XenEvtchnState {
@@ -151,7 +161,7 @@ struct XenEvtchnState {
     /* Per-GSI assertion state (serialized) */
     uint32_t pirq_gsi_set;
 
-    /* Per-PIRQ information (rebuilt on migration) */
+    /* Per-PIRQ information (rebuilt on migration, protected by BQL) */
     struct pirq_info *pirq;
 };
 
@@ -1007,16 +1017,23 @@ static bool virq_is_global(uint32_t virq)
     }
 }
 
-static int close_port(XenEvtchnState *s, evtchn_port_t port)
+static int close_port(XenEvtchnState *s, evtchn_port_t port,
+                      bool *flush_kvm_routes)
 {
     XenEvtchnPort *p = &s->port_table[port];
 
+    /* Because it *might* be a PIRQ port */
+    assert(qemu_mutex_iothread_locked());
+
     switch (p->type) {
     case EVTCHNSTAT_closed:
         return -ENOENT;
 
     case EVTCHNSTAT_pirq:
         s->pirq[p->type_val].port = 0;
+        if (s->pirq[p->type_val].is_translated) {
+            *flush_kvm_routes = true;
+        }
         break;
 
     case EVTCHNSTAT_virq:
@@ -1065,6 +1082,7 @@ static int close_port(XenEvtchnState *s, evtchn_port_t port)
 int xen_evtchn_soft_reset(void)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
+    bool flush_kvm_routes;
     int i;
 
     if (!s) {
@@ -1073,10 +1091,16 @@ int xen_evtchn_soft_reset(void)
 
     assert(qemu_mutex_iothread_locked());
 
-    QEMU_LOCK_GUARD(&s->port_lock);
+    qemu_mutex_lock(&s->port_lock);
 
     for (i = 0; i < s->nr_ports; i++) {
-        close_port(s, i);
+        close_port(s, i, &flush_kvm_routes);
+    }
+
+    qemu_mutex_unlock(&s->port_lock);
+
+    if (flush_kvm_routes) {
+        kvm_update_msi_routes_all(NULL, true, 0, 0);
     }
 
     return 0;
@@ -1094,6 +1118,7 @@ int xen_evtchn_reset_op(struct evtchn_reset *reset)
 int xen_evtchn_close_op(struct evtchn_close *close)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
+    bool flush_kvm_routes = false;
     int ret;
 
     if (!s) {
@@ -1104,12 +1129,17 @@ int xen_evtchn_close_op(struct evtchn_close *close)
         return -EINVAL;
     }
 
+    QEMU_IOTHREAD_LOCK_GUARD();
     qemu_mutex_lock(&s->port_lock);
 
-    ret = close_port(s, close->port);
+    ret = close_port(s, close->port, &flush_kvm_routes);
 
     qemu_mutex_unlock(&s->port_lock);
 
+    if (flush_kvm_routes) {
+        kvm_update_msi_routes_all(NULL, true, 0, 0);
+    }
+
     return ret;
 }
 
@@ -1226,21 +1256,54 @@ int xen_evtchn_bind_pirq_op(struct evtchn_bind_pirq *pirq)
         return -EINVAL;
     }
 
-    QEMU_LOCK_GUARD(&s->port_lock);
+    QEMU_IOTHREAD_LOCK_GUARD();
 
     if (s->pirq[pirq->pirq].port) {
         return -EBUSY;
     }
 
+    qemu_mutex_lock(&s->port_lock);
+
     ret = allocate_port(s, 0, EVTCHNSTAT_pirq, pirq->pirq,
                         &pirq->port);
     if (ret) {
+        qemu_mutex_unlock(&s->port_lock);
         return ret;
     }
 
     s->pirq[pirq->pirq].port = pirq->port;
     trace_kvm_xen_bind_pirq(pirq->pirq, pirq->port);
 
+    qemu_mutex_unlock(&s->port_lock);
+
+    /*
+     * Need to do the unmask outside port_lock because it may call
+     * back into the MSI translate function.
+     */
+    if (s->pirq[pirq->pirq].gsi == IRQ_MSI_EMU) {
+        if (s->pirq[pirq->pirq].is_masked) {
+            PCIDevice *dev = s->pirq[pirq->pirq].dev;
+            int vector = s->pirq[pirq->pirq].vector;
+            char *dev_path = qdev_get_dev_path(DEVICE(dev));
+
+            trace_kvm_xen_unmask_pirq(pirq->pirq, dev_path, vector);
+            g_free(dev_path);
+
+            if (s->pirq[pirq->pirq].is_msix) {
+                msix_set_mask(dev, vector, false);
+            } else {
+                msi_set_mask(dev, vector, false, NULL);
+            }
+        } else if (s->pirq[pirq->pirq].is_translated) {
+            /*
+             * If KVM had attempted to translate this one before, make it try
+             * again. If we unmasked, then the notifier on the MSI(-X) vector
+             * will already have had the same effect.
+             */
+            kvm_update_msi_routes_all(NULL, true, 0, 0);
+        }
+    }
+
     return ret;
 }
 
@@ -1559,6 +1622,179 @@ bool xen_evtchn_set_gsi(int gsi, int level)
     return true;
 }
 
+static uint32_t msi_pirq_target(uint64_t addr, uint32_t data)
+{
+    /* The vector (in low 8 bits of data) must be zero */
+    if (data & 0xff) {
+        return 0;
+    }
+
+    uint32_t pirq = (addr & 0xff000) >> 12;
+    pirq |= (addr >> 32) & 0xffffff00;
+
+    return pirq;
+}
+
+static void do_remove_pci_vector(XenEvtchnState *s, PCIDevice *dev, int vector,
+                                 int except_pirq)
+{
+    uint32_t pirq;
+
+    for (pirq = 0; pirq < s->nr_pirqs; pirq++) {
+        /*
+         * We could be cleverer here, but it isn't really a fast path, and
+         * this trivial optimisation is enough to let us skip the big gap
+         * in the middle a bit quicker (in terms of both loop iterations,
+         * and cache lines).
+         */
+        if (!(pirq & 63) && !(pirq_inuse_word(s, pirq))) {
+            pirq += 64;
+            continue;
+        }
+        if (except_pirq && pirq == except_pirq) {
+            continue;
+        }
+        if (s->pirq[pirq].dev != dev) {
+            continue;
+        }
+        if (vector != -1 && s->pirq[pirq].vector != vector) {
+            continue;
+        }
+
+        /* It could theoretically be bound to a port already, but that is OK. */
+        s->pirq[pirq].dev = dev;
+        s->pirq[pirq].gsi = IRQ_UNBOUND;
+        s->pirq[pirq].is_msix = false;
+        s->pirq[pirq].vector = 0;
+        s->pirq[pirq].is_masked = false;
+        s->pirq[pirq].is_translated = false;
+    }
+}
+
+void xen_evtchn_remove_pci_device(PCIDevice *dev)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+
+    if (!s) {
+        return;
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+    do_remove_pci_vector(s, dev, -1, 0);
+}
+
+void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector,
+                          uint64_t addr, uint32_t data, bool is_masked)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    uint32_t pirq;
+
+    if (!s) {
+        return;
+    }
+
+    assert(qemu_mutex_iothread_locked());
+
+    pirq = msi_pirq_target(addr, data);
+
+    /*
+     * The PIRQ# must be sane, and there must be an allocated PIRQ in
+     * IRQ_UNBOUND or IRQ_MSI_EMU state to match it.
+     */
+    if (!pirq || pirq >= s->nr_pirqs || !pirq_inuse(s, pirq) ||
+        (s->pirq[pirq].gsi != IRQ_UNBOUND &&
+         s->pirq[pirq].gsi != IRQ_MSI_EMU)) {
+        pirq = 0;
+    }
+
+    if (pirq) {
+        s->pirq[pirq].dev = dev;
+        s->pirq[pirq].gsi = IRQ_MSI_EMU;
+        s->pirq[pirq].is_msix = is_msix;
+        s->pirq[pirq].vector = vector;
+        s->pirq[pirq].is_masked = is_masked;
+    }
+
+    /* Remove any (other) entries for this {device, vector} */
+    do_remove_pci_vector(s, dev, vector, pirq);
+}
+
+int xen_evtchn_translate_pirq_msi(struct kvm_irq_routing_entry *route,
+                                  uint64_t address, uint32_t data)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    uint32_t pirq, port;
+    CPUState *cpu;
+
+    if (!s) {
+        return 1; /* Not a PIRQ */
+    }
+
+    assert(qemu_mutex_iothread_locked());
+
+    pirq = msi_pirq_target(address, data);
+    if (!pirq || pirq >= s->nr_pirqs) {
+        return 1; /* Not a PIRQ */
+    }
+
+    if (!kvm_xen_has_cap(EVTCHN_2LEVEL)) {
+        return -ENOTSUP;
+    }
+
+    if (s->pirq[pirq].gsi != IRQ_MSI_EMU) {
+        return -EINVAL;
+    }
+
+    /* Remember that KVM tried to translate this. It might need to try again. */
+    s->pirq[pirq].is_translated = true;
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    port = s->pirq[pirq].port;
+    if (!valid_port(port)) {
+        return -EINVAL;
+    }
+
+    cpu = qemu_get_cpu(s->port_table[port].vcpu);
+    if (!cpu) {
+        return -EINVAL;
+    }
+
+    route->type = KVM_IRQ_ROUTING_XEN_EVTCHN;
+    route->u.xen_evtchn.port = port;
+    route->u.xen_evtchn.vcpu = kvm_arch_vcpu_id(cpu);
+    route->u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+    return 0; /* Handled */
+}
+
+bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    uint32_t pirq, port;
+
+    if (!s) {
+        return false;
+    }
+
+    assert(qemu_mutex_iothread_locked());
+
+    pirq = msi_pirq_target(address, data);
+    if (!pirq || pirq >= s->nr_pirqs) {
+        return false;
+    }
+
+    QEMU_LOCK_GUARD(&s->port_lock);
+
+    port = s->pirq[pirq].port;
+    if (!valid_port(port)) {
+        return false;
+    }
+
+    set_port_pending(s, port);
+    return true;
+}
+
 int xen_physdev_map_pirq(struct physdev_map_pirq *map)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
@@ -1569,6 +1805,7 @@ int xen_physdev_map_pirq(struct physdev_map_pirq *map)
         return -ENOTSUP;
     }
 
+    QEMU_IOTHREAD_LOCK_GUARD();
     QEMU_LOCK_GUARD(&s->port_lock);
 
     if (map->domid != DOMID_SELF && map->domid != xen_domid) {
@@ -1628,9 +1865,11 @@ int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap)
         return -EINVAL;
     }
 
-    QEMU_LOCK_GUARD(&s->port_lock);
+    QEMU_IOTHREAD_LOCK_GUARD();
+    qemu_mutex_lock(&s->port_lock);
 
     if (!pirq_inuse(s, pirq)) {
+        qemu_mutex_unlock(&s->port_lock);
         return -ENOENT;
     }
 
@@ -1638,6 +1877,7 @@ int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap)
 
     /* We can only unmap GSI PIRQs */
     if (gsi < 0) {
+        qemu_mutex_unlock(&s->port_lock);
         return -EINVAL;
     }
 
@@ -1646,6 +1886,12 @@ int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap)
     pirq_inuse_word(s, pirq) &= ~pirq_inuse_bit(pirq);
 
     trace_kvm_xen_unmap_pirq(pirq, gsi);
+    qemu_mutex_unlock(&s->port_lock);
+
+    if (gsi == IRQ_MSI_EMU) {
+        kvm_update_msi_routes_all(NULL, true, 0, 0);
+    }
+
     return 0;
 }
 
@@ -1659,6 +1905,7 @@ int xen_physdev_eoi_pirq(struct physdev_eoi *eoi)
         return -ENOTSUP;
     }
 
+    QEMU_IOTHREAD_LOCK_GUARD();
     QEMU_LOCK_GUARD(&s->port_lock);
 
     if (!pirq_inuse(s, pirq)) {
@@ -1690,6 +1937,7 @@ int xen_physdev_query_pirq(struct physdev_irq_status_query *query)
         return -ENOTSUP;
     }
 
+    QEMU_IOTHREAD_LOCK_GUARD();
     QEMU_LOCK_GUARD(&s->port_lock);
 
     if (!pirq_inuse(s, pirq)) {
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 95400b7..bfb67ac 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -25,6 +25,14 @@ void xen_evtchn_set_callback_level(int level);
 int xen_evtchn_set_port(uint16_t port);
 
 bool xen_evtchn_set_gsi(int gsi, int level);
+void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector,
+                          uint64_t addr, uint32_t data, bool is_masked);
+void xen_evtchn_remove_pci_device(PCIDevice *dev);
+struct kvm_irq_routing_entry;
+int xen_evtchn_translate_pirq_msi(struct kvm_irq_routing_entry *route,
+                                  uint64_t address, uint32_t data);
+bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data);
+
 
 /*
  * These functions mirror the libxenevtchn library API, providing the QEMU
diff --git a/hw/pci/msi.c b/hw/pci/msi.c
index 1cadf15..041b0bd 100644
--- a/hw/pci/msi.c
+++ b/hw/pci/msi.c
@@ -24,6 +24,8 @@
 #include "qemu/range.h"
 #include "qapi/error.h"
 
+#include "hw/i386/kvm/xen_evtchn.h"
+
 /* PCI_MSI_ADDRESS_LO */
 #define PCI_MSI_ADDRESS_LO_MASK         (~0x3)
 
@@ -414,6 +416,15 @@ void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len)
     fprintf(stderr, "\n");
 #endif
 
+    if (xen_mode == XEN_EMULATE) {
+        for (vector = 0; vector < msi_nr_vectors(flags); vector++) {
+            MSIMessage msg = msi_prepare_message(dev, vector);
+
+            xen_evtchn_snoop_msi(dev, false, vector, msg.address, msg.data,
+                                 msi_is_masked(dev, vector));
+        }
+    }
+
     if (!(flags & PCI_MSI_FLAGS_ENABLE)) {
         return;
     }
diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index 9e70fcd..ab8869d 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -26,6 +26,8 @@
 #include "qapi/error.h"
 #include "trace.h"
 
+#include "hw/i386/kvm/xen_evtchn.h"
+
 /* MSI enable bit and maskall bit are in byte 1 in FLAGS register */
 #define MSIX_CONTROL_OFFSET (PCI_MSIX_FLAGS + 1)
 #define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
@@ -124,6 +126,13 @@ static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked)
 {
     bool is_masked = msix_is_masked(dev, vector);
 
+    if (xen_mode == XEN_EMULATE) {
+        MSIMessage msg = msix_prepare_message(dev, vector);
+
+        xen_evtchn_snoop_msi(dev, true, vector, msg.address, msg.data,
+                             is_masked);
+    }
+
     if (is_masked == was_masked) {
         return;
     }
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index bad8e63..10c980b 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -49,6 +49,9 @@
 #include "qemu/cutils.h"
 #include "pci-internal.h"
 
+#include "hw/xen/xen.h"
+#include "hw/i386/kvm/xen_evtchn.h"
+
 //#define DEBUG_PCI
 #ifdef DEBUG_PCI
 # define PCI_DPRINTF(format, ...)       printf(format, ## __VA_ARGS__)
@@ -319,6 +322,17 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg)
 {
     MemTxAttrs attrs = {};
 
+    /*
+     * Xen uses the high bits of the address to contain some of the bits
+     * of the PIRQ#. Therefore we can't just send the write cycle and
+     * trust that it's caught by the APIC at 0xfee00000 because the
+     * target of the write might be e.g. 0x0x1000fee46000 for PIRQ#4166.
+     * So we intercept the delivery here instead of in kvm_send_msi().
+     */
+    if (xen_mode == XEN_EMULATE &&
+        xen_evtchn_deliver_pirq_msi(msg.address, msg.data)) {
+        return;
+    }
     attrs.requester_id = pci_requester_id(dev);
     address_space_stl_le(&dev->bus_master_as, msg.address, msg.data,
                          attrs, NULL);
@@ -988,6 +1002,9 @@ static void do_pci_unregister_device(PCIDevice *pci_dev)
     pci_get_bus(pci_dev)->devices[pci_dev->devfn] = NULL;
     pci_config_free(pci_dev);
 
+    if (xen_mode == XEN_EMULATE) {
+        xen_evtchn_remove_pci_device(pci_dev);
+    }
     if (memory_region_is_mapped(&pci_dev->bus_master_enable_region)) {
         memory_region_del_subregion(&pci_dev->bus_master_container_region,
                                     &pci_dev->bus_master_enable_region);
-- 
cgit v1.1


From e16aff4cc2ef285d1346abcff1a1752614fa9c60 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Wed, 18 Jan 2023 14:36:23 +0000
Subject: kvm/i386: Add xen-evtchn-max-pirq property

The default number of PIRQs is set to 256 to avoid issues with 32-bit MSI
devices. Allow it to be increased if the user desires.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/kvm/xen_evtchn.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 69c0204..886fbf6 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -302,17 +302,18 @@ void xen_evtchn_create(void)
     }
 
     /*
-     * We could parameterise the number of PIRQs available if needed,
-     * but for now limit it to 256. The Xen scheme for encoding PIRQ#
-     * into an MSI message is not compatible with 32-bit MSI, as it
-     * puts the high bits of the PIRQ# into the high bits of the MSI
-     * message address, instead of using the Extended Destination ID
-     * in address bits 4-11 which perhaps would have been a better
-     * choice. So to keep life simple, just stick with 256 as the
-     * default, which conveniently doesn't need to set anything
-     * outside the low 32 bits of the address.
+     * The Xen scheme for encoding PIRQ# into an MSI message is not
+     * compatible with 32-bit MSI, as it puts the high bits of the
+     * PIRQ# into the high bits of the MSI message address, instead of
+     * using the Extended Destination ID in address bits 4-11 which
+     * perhaps would have been a better choice.
+     *
+     * To keep life simple, kvm_accel_instance_init() initialises the
+     * default to 256. which conveniently doesn't need to set anything
+     * outside the low 32 bits of the address. It can be increased by
+     * setting the xen-evtchn-max-pirq property.
      */
-    s->nr_pirqs = 256;
+    s->nr_pirqs = kvm_xen_get_evtchn_max_pirq();
 
     s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64);
     s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words);
-- 
cgit v1.1


From 79807f3e6bf1186c684312d4e7fb426b2643bade Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Wed, 15 Feb 2023 16:05:26 +0100
Subject: hw/xen: Subsume xen_be_register_common() into xen_be_init()

Every caller of xen_be_init() checks and exits on error, then calls
xen_be_register_common(). Just make xen_be_init() abort for itself and
return void, and register the common devices too.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
---
 hw/i386/xen/xen-hvm.c       |  8 +------
 hw/xen/xen-legacy-backend.c | 56 ++++++++++++++++++---------------------------
 hw/xenpv/xen_machine_pv.c   |  6 +----
 3 files changed, 24 insertions(+), 46 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index b9a6f7f..e5a1dd1 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -1502,13 +1502,7 @@ void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory)
     device_listener_register(&state->device_listener);
 
     xen_bus_init();
-
-    /* Initialize backend core & drivers */
-    if (xen_be_init() != 0) {
-        error_report("xen backend core setup failed");
-        goto err;
-    }
-    xen_be_register_common();
+    xen_be_init();
 
     QLIST_INIT(&xen_physmap);
     xen_read_physmap(state);
diff --git a/hw/xen/xen-legacy-backend.c b/hw/xen/xen-legacy-backend.c
index 085fd31..afba71f 100644
--- a/hw/xen/xen-legacy-backend.c
+++ b/hw/xen/xen-legacy-backend.c
@@ -676,21 +676,30 @@ void xenstore_update_fe(char *watch, struct XenLegacyDevice *xendev)
 }
 /* -------------------------------------------------------------------- */
 
-int xen_be_init(void)
+static void xen_set_dynamic_sysbus(void)
+{
+    Object *machine = qdev_get_machine();
+    ObjectClass *oc = object_get_class(machine);
+    MachineClass *mc = MACHINE_CLASS(oc);
+
+    machine_class_allow_dynamic_sysbus_dev(mc, TYPE_XENSYSDEV);
+}
+
+void xen_be_init(void)
 {
     xengnttab_handle *gnttabdev;
 
     xenstore = xs_daemon_open();
     if (!xenstore) {
         xen_pv_printf(NULL, 0, "can't connect to xenstored\n");
-        return -1;
+        exit(1);
     }
 
     qemu_set_fd_handler(xs_fileno(xenstore), xenstore_update, NULL, NULL);
 
     if (xen_xc == NULL || xen_fmem == NULL) {
-        /* Check if xen_init() have been called */
-        goto err;
+        xen_pv_printf(NULL, 0, "Xen operations not set up\n");
+        exit(1);
     }
 
     gnttabdev = xengnttab_open(NULL, 0);
@@ -706,23 +715,16 @@ int xen_be_init(void)
     xen_sysbus = qbus_new(TYPE_XENSYSBUS, xen_sysdev, "xen-sysbus");
     qbus_set_bus_hotplug_handler(xen_sysbus);
 
-    return 0;
-
-err:
-    qemu_set_fd_handler(xs_fileno(xenstore), NULL, NULL, NULL);
-    xs_daemon_close(xenstore);
-    xenstore = NULL;
-
-    return -1;
-}
-
-static void xen_set_dynamic_sysbus(void)
-{
-    Object *machine = qdev_get_machine();
-    ObjectClass *oc = object_get_class(machine);
-    MachineClass *mc = MACHINE_CLASS(oc);
+    xen_set_dynamic_sysbus();
 
-    machine_class_allow_dynamic_sysbus_dev(mc, TYPE_XENSYSDEV);
+    xen_be_register("console", &xen_console_ops);
+    xen_be_register("vkbd", &xen_kbdmouse_ops);
+#ifdef CONFIG_VIRTFS
+    xen_be_register("9pfs", &xen_9pfs_ops);
+#endif
+#ifdef CONFIG_USB_LIBUSB
+    xen_be_register("qusb", &xen_usb_ops);
+#endif
 }
 
 int xen_be_register(const char *type, struct XenDevOps *ops)
@@ -744,20 +746,6 @@ int xen_be_register(const char *type, struct XenDevOps *ops)
     return xenstore_scan(type, xen_domid, ops);
 }
 
-void xen_be_register_common(void)
-{
-    xen_set_dynamic_sysbus();
-
-    xen_be_register("console", &xen_console_ops);
-    xen_be_register("vkbd", &xen_kbdmouse_ops);
-#ifdef CONFIG_VIRTFS
-    xen_be_register("9pfs", &xen_9pfs_ops);
-#endif
-#ifdef CONFIG_USB_LIBUSB
-    xen_be_register("qusb", &xen_usb_ops);
-#endif
-}
-
 int xen_be_bind_evtchn(struct XenLegacyDevice *xendev)
 {
     if (xendev->local_port != -1) {
diff --git a/hw/xenpv/xen_machine_pv.c b/hw/xenpv/xen_machine_pv.c
index 20c9611..2e759d0 100644
--- a/hw/xenpv/xen_machine_pv.c
+++ b/hw/xenpv/xen_machine_pv.c
@@ -36,10 +36,7 @@ static void xen_init_pv(MachineState *machine)
     int i;
 
     /* Initialize backend core & drivers */
-    if (xen_be_init() != 0) {
-        error_report("%s: xen backend core setup failed", __func__);
-        exit(1);
-    }
+    xen_be_init();
 
     switch (xen_mode) {
     case XEN_ATTACH:
@@ -55,7 +52,6 @@ static void xen_init_pv(MachineState *machine)
         break;
     }
 
-    xen_be_register_common();
     xen_be_register("vfb", &xen_framebuffer_ops);
     xen_be_register("qnic", &xen_netdev_ops);
 
-- 
cgit v1.1


From b97f3147b0a3992ea079d5ce29b755984f30ba4f Mon Sep 17 00:00:00 2001
From: Peter Krempa <pkrempa@redhat.com>
Date: Tue, 28 Feb 2023 09:12:34 +0100
Subject: qapi: Add 'acpi' field to 'query-machines' output

Report which machine types support ACPI so that management applications
can properly use the 'acpi' property even on platforms such as ARM where
support for ACPI depends on the machine type and thus checking presence
of '-machine acpi=' in 'query-command-line-options' is insufficient.

Signed-off-by: Peter Krempa <pkrempa@redhat.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <537625d3e25d345052322c42ca19812b98b4f49a.1677571792.git.pkrempa@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/core/machine-qmp-cmds.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'hw')

diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c
index 2d90474..b98ff15 100644
--- a/hw/core/machine-qmp-cmds.c
+++ b/hw/core/machine-qmp-cmds.c
@@ -102,6 +102,7 @@ MachineInfoList *qmp_query_machines(Error **errp)
         info->hotpluggable_cpus = mc->has_hotpluggable_cpus;
         info->numa_mem_supported = mc->numa_mem_supported;
         info->deprecated = !!mc->deprecation_reason;
+        info->acpi = !!object_class_property_find(OBJECT_CLASS(mc), "acpi");
         if (mc->default_cpu_type) {
             info->default_cpu_type = g_strdup(mc->default_cpu_type);
         }
-- 
cgit v1.1