virtio-pci: Use ioeventfd for virtqueue notify

Virtqueue notify is currently handled synchronously in userspace virtio. This prevents the vcpu from executing guest code while hardware emulation code handles the notify.

On systems that support KVM, the ioeventfd mechanism can be used to make virtqueue notify a lightweight exit by deferring hardware emulation to the iothread and allowing the VM to continue execution. This model is similar to how vhost receives virtqueue notifies.

The result of this change is improved performance for userspace virtio devices. Virtio-blk throughput increases especially for multithreaded scenarios and virtio-net transmit throughput increases substantially.

Some virtio devices are known to have guest drivers which expect a notify to be processed synchronously and spin waiting for completion. For virtio-net, this also seems to interact with the guest stack in strange ways so that TCP throughput for small message sizes (~200 bytes) is harmed. Only enable ioeventfd for virtio-blk for now.

Care must be taken not to interfere with vhost-net, which uses host notifiers. If the set_host_notifier() API is used by a device, virtio-pci will disable virtio-ioeventfd and let the device deal with host notifiers as it wishes.

Finally, there used to be a limit of 6 KVM io bus devices inside the kernel. On such a kernel, don't use ioeventfd for virtqueue host notification since the limit is reached too easily. This ensures that existing vhost-net setups (which always use ioeventfd) have ioeventfds available so they can continue to work.

After migration and on VM change state (running/paused) virtio-ioeventfd will enable/disable itself.

 * VIRTIO_CONFIG_S_DRIVER_OK -> enable virtio-ioeventfd
 * !VIRTIO_CONFIG_S_DRIVER_OK -> disable virtio-ioeventfd
 * virtio_pci_set_host_notifier() -> disable virtio-ioeventfd
 * vm_change_state(running=0) -> disable virtio-ioeventfd
 * vm_change_state(running=1) -> enable virtio-ioeventfd

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
author: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> 2010-12-17 12:01:50 +0000
committer: Michael S. Tsirkin <mst@redhat.com> 2011-01-10 14:44:16 +0200
commit: 25db9ebe15125deb32958c6df74996f745edf1f9 (patch)
tree: f1a5117bf6ffe11264f9da55b00ac5f803076269
parent: d2f2b8a740c82319f9eea51ebed50815fbc3da3e (diff)
download: qemu-25db9ebe15125deb32958c6df74996f745edf1f9.zip
qemu-25db9ebe15125deb32958c6df74996f745edf1f9.tar.gz
qemu-25db9ebe15125deb32958c6df74996f745edf1f9.tar.bz2
3 files changed, 184 insertions, 25 deletions
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 13dd391..70c40ee 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -83,6 +83,11 @@
 /* Flags track per-device state like workarounds for quirks in older guests. */
 #define VIRTIO_PCI_FLAG_BUS_MASTER_BUG  (1 << 0)
 
+/* Performance improves when virtqueue kick processing is decoupled from the
+ * vcpu thread using ioeventfd for some devices. */
+#define VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT 1
+#define VIRTIO_PCI_FLAG_USE_IOEVENTFD   (1 << VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT)
+
 /* QEMU doesn't strictly need write barriers since everything runs in
  * lock-step.  We'll leave the calls to wmb() in though to make it obvious for
  * KVM or if kqemu gets SMP support.
@@ -107,6 +112,8 @@ typedef struct {
     /* Max. number of ports we can have for a the virtio-serial device */
     uint32_t max_virtserial_ports;
     virtio_net_conf net;
+    bool ioeventfd_disabled;
+    bool ioeventfd_started;
 } VirtIOPCIProxy;
 
 /* virtio device */
@@ -179,12 +186,132 @@ static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f)
     return 0;
 }
 
+static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
+                                                 int n, bool assign)
+{
+    VirtQueue *vq = virtio_get_queue(proxy->vdev, n);
+    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
+    int r;
+    if (assign) {
+        r = event_notifier_init(notifier, 1);
+        if (r < 0) {
+            return r;
+        }
+        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
+                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
+                                       n, assign);
+        if (r < 0) {
+            event_notifier_cleanup(notifier);
+        }
+    } else {
+        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
+                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
+                                       n, assign);
+        if (r < 0) {
+            return r;
+        }
+
+        /* Handle the race condition where the guest kicked and we deassigned
+         * before we got around to handling the kick.
+         */
+        if (event_notifier_test_and_clear(notifier)) {
+            virtio_queue_notify_vq(vq);
+        }
+
+        event_notifier_cleanup(notifier);
+    }
+    return r;
+}
+
+static void virtio_pci_host_notifier_read(void *opaque)
+{
+    VirtQueue *vq = opaque;
+    EventNotifier *n = virtio_queue_get_host_notifier(vq);
+    if (event_notifier_test_and_clear(n)) {
+        virtio_queue_notify_vq(vq);
+    }
+}
+
+static void virtio_pci_set_host_notifier_fd_handler(VirtIOPCIProxy *proxy,
+                                                    int n, bool assign)
+{
+    VirtQueue *vq = virtio_get_queue(proxy->vdev, n);
+    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
+    if (assign) {
+        qemu_set_fd_handler(event_notifier_get_fd(notifier),
+                            virtio_pci_host_notifier_read, NULL, vq);
+    } else {
+        qemu_set_fd_handler(event_notifier_get_fd(notifier),
+                            NULL, NULL, NULL);
+    }
+}
+
+static int virtio_pci_start_ioeventfd(VirtIOPCIProxy *proxy)
+{
+    int n, r;
+
+    if (!(proxy->flags & VIRTIO_PCI_FLAG_USE_IOEVENTFD) ||
+        proxy->ioeventfd_disabled ||
+        proxy->ioeventfd_started) {
+        return 0;
+    }
+
+    for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) {
+        if (!virtio_queue_get_num(proxy->vdev, n)) {
+            continue;
+        }
+
+        r = virtio_pci_set_host_notifier_internal(proxy, n, true);
+        if (r < 0) {
+            goto assign_error;
+        }
+
+        virtio_pci_set_host_notifier_fd_handler(proxy, n, true);
+    }
+    proxy->ioeventfd_started = true;
+    return 0;
+
+assign_error:
+    while (--n >= 0) {
+        if (!virtio_queue_get_num(proxy->vdev, n)) {
+            continue;
+        }
+
+        virtio_pci_set_host_notifier_fd_handler(proxy, n, false);
+        virtio_pci_set_host_notifier_internal(proxy, n, false);
+    }
+    proxy->ioeventfd_started = false;
+    proxy->ioeventfd_disabled = true;
+    return r;
+}
+
+static int virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy)
+{
+    int n;
+
+    if (!proxy->ioeventfd_started) {
+        return 0;
+    }
+
+    for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) {
+        if (!virtio_queue_get_num(proxy->vdev, n)) {
+            continue;
+        }
+
+        virtio_pci_set_host_notifier_fd_handler(proxy, n, false);
+        virtio_pci_set_host_notifier_internal(proxy, n, false);
+    }
+    proxy->ioeventfd_started = false;
+    return 0;
+}
+
 static void virtio_pci_reset(DeviceState *d)
 {
     VirtIOPCIProxy *proxy = container_of(d, VirtIOPCIProxy, pci_dev.qdev);
+    virtio_pci_stop_ioeventfd(proxy);
     virtio_reset(proxy->vdev);
     msix_reset(&proxy->pci_dev);
-    proxy->flags = 0;
+    proxy->flags &= ~VIRTIO_PCI_FLAG_BUS_MASTER_BUG;
 }
 
 static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
@@ -209,6 +336,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
     case VIRTIO_PCI_QUEUE_PFN:
         pa = (target_phys_addr_t)val << VIRTIO_PCI_QUEUE_ADDR_SHIFT;
         if (pa == 0) {
+            virtio_pci_stop_ioeventfd(proxy);
             virtio_reset(proxy->vdev);
             msix_unuse_all_vectors(&proxy->pci_dev);
         }
@@ -223,7 +351,16 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
         virtio_queue_notify(vdev, val);
         break;
     case VIRTIO_PCI_STATUS:
+        if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
+            virtio_pci_stop_ioeventfd(proxy);
+        }
+
         virtio_set_status(vdev, val & 0xFF);
+
+        if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
+            virtio_pci_start_ioeventfd(proxy);
+        }
+
         if (vdev->status == 0) {
             virtio_reset(proxy->vdev);
             msix_unuse_all_vectors(&proxy->pci_dev);
@@ -403,6 +540,7 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
     if (PCI_COMMAND == address) {
         if (!(val & PCI_COMMAND_MASTER)) {
             if (!(proxy->flags & VIRTIO_PCI_FLAG_BUS_MASTER_BUG)) {
+                virtio_pci_stop_ioeventfd(proxy);
                 virtio_set_status(proxy->vdev,
                                   proxy->vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
             }
@@ -480,30 +618,30 @@ assign_error:
 static int virtio_pci_set_host_notifier(void *opaque, int n, bool assign)
 {
     VirtIOPCIProxy *proxy = opaque;
-    VirtQueue *vq = virtio_get_queue(proxy->vdev, n);
-    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
-    int r;
+
+    /* Stop using ioeventfd for virtqueue kick if the device starts using host
+     * notifiers.  This makes it easy to avoid stepping on each others' toes.
+     */
+    proxy->ioeventfd_disabled = assign;
     if (assign) {
-        r = event_notifier_init(notifier, 1);
-        if (r < 0) {
-            return r;
-        }
-        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
-                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
-                                       n, assign);
-        if (r < 0) {
-            event_notifier_cleanup(notifier);
-        }
+        virtio_pci_stop_ioeventfd(proxy);
+    }
+    /* We don't need to start here: it's not needed because backend
+     * currently only stops on status change away from ok,
+     * reset, vmstop and such. If we do add code to start here,
+     * need to check vmstate, device state etc. */
+    return virtio_pci_set_host_notifier_internal(proxy, n, assign);
+}
+
+static void virtio_pci_vmstate_change(void *opaque, bool running)
+{
+    VirtIOPCIProxy *proxy = opaque;
+
+    if (running) {
+        virtio_pci_start_ioeventfd(proxy);
     } else {
-        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
-                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
-                                       n, assign);
-        if (r < 0) {
-            return r;
-        }
-        event_notifier_cleanup(notifier);
+        virtio_pci_stop_ioeventfd(proxy);
     }
-    return r;
 }
 
 static const VirtIOBindings virtio_pci_bindings = {
@@ -515,6 +653,7 @@ static const VirtIOBindings virtio_pci_bindings = {
     .get_features = virtio_pci_get_features,
     .set_host_notifier = virtio_pci_set_host_notifier,
     .set_guest_notifiers = virtio_pci_set_guest_notifiers,
+    .vmstate_change = virtio_pci_vmstate_change,
 };
 
 static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
@@ -559,10 +698,15 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
     pci_register_bar(&proxy->pci_dev, 0, size, PCI_BASE_ADDRESS_SPACE_IO,
                            virtio_map);
 
+    if (!kvm_has_many_ioeventfds()) {
+        proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
+    }
+
     virtio_bind_device(vdev, &virtio_pci_bindings, proxy);
     proxy->host_features |= 0x1 << VIRTIO_F_NOTIFY_ON_EMPTY;
     proxy->host_features |= 0x1 << VIRTIO_F_BAD_FEATURE;
     proxy->host_features = vdev->get_features(vdev, proxy->host_features);
+
 }
 
 static int virtio_blk_init_pci(PCIDevice *pci_dev)
@@ -597,6 +741,7 @@ static int virtio_blk_exit_pci(PCIDevice *pci_dev)
 {
     VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
 
+    virtio_pci_stop_ioeventfd(proxy);
     virtio_blk_exit(proxy->vdev);
     blockdev_mark_auto_del(proxy->block.bs);
     return virtio_exit_pci(pci_dev);
@@ -658,6 +803,7 @@ static int virtio_net_exit_pci(PCIDevice *pci_dev)
 {
     VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
 
+    virtio_pci_stop_ioeventfd(proxy);
     virtio_net_exit(proxy->vdev);
     return virtio_exit_pci(pci_dev);
 }
@@ -705,6 +851,8 @@ static PCIDeviceInfo virtio_info[] = {
         .qdev.props = (Property[]) {
             DEFINE_PROP_HEX32("class", VirtIOPCIProxy, class_code, 0),
             DEFINE_BLOCK_PROPERTIES(VirtIOPCIProxy, block),
+            DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+                            VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
             DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
             DEFINE_VIRTIO_BLK_FEATURES(VirtIOPCIProxy, host_features),
             DEFINE_PROP_END_OF_LIST(),
@@ -717,6 +865,8 @@ static PCIDeviceInfo virtio_info[] = {
         .exit       = virtio_net_exit_pci,
         .romfile    = "pxe-virtio.bin",
         .qdev.props = (Property[]) {
+            DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+                            VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, false),
             DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3),
             DEFINE_VIRTIO_NET_FEATURES(VirtIOPCIProxy, host_features),
             DEFINE_NIC_PROPERTIES(VirtIOPCIProxy, nic),
diff --git a/hw/virtio.c b/hw/virtio.c
index 1d20be2..31bd9e3 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -575,11 +575,19 @@ int virtio_queue_get_num(VirtIODevice *vdev, int n)
     return vdev->vq[n].vring.num;
 }
 
+void virtio_queue_notify_vq(VirtQueue *vq)
+{
+    if (vq->vring.desc) {
+        VirtIODevice *vdev = vq->vdev;
+        trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
+        vq->handle_output(vdev, vq);
+    }
+}
+
 void virtio_queue_notify(VirtIODevice *vdev, int n)
 {
-    if (n < VIRTIO_PCI_QUEUE_MAX && vdev->vq[n].vring.desc) {
-        trace_virtio_queue_notify(vdev, n, &vdev->vq[n]);
-        vdev->vq[n].handle_output(vdev, &vdev->vq[n]);
+    if (n < VIRTIO_PCI_QUEUE_MAX) {
+        virtio_queue_notify_vq(&vdev->vq[n]);
     }
 }
 
diff --git a/hw/virtio.h b/hw/virtio.h
index bd52742..d8546d5 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -222,5 +222,6 @@ void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx);
 VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n);
 EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq);
 EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
+void virtio_queue_notify_vq(VirtQueue *vq);
 void virtio_irq(VirtQueue *vq);
 #endif
author	Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>	2010-12-17 12:01:50 +0000
committer	Michael S. Tsirkin <mst@redhat.com>	2011-01-10 14:44:16 +0200
commit	25db9ebe15125deb32958c6df74996f745edf1f9 (patch)
tree	f1a5117bf6ffe11264f9da55b00ac5f803076269
parent	d2f2b8a740c82319f9eea51ebed50815fbc3da3e (diff)
download	qemu-25db9ebe15125deb32958c6df74996f745edf1f9.zip qemu-25db9ebe15125deb32958c6df74996f745edf1f9.tar.gz qemu-25db9ebe15125deb32958c6df74996f745edf1f9.tar.bz2