From dda53ee93af90547912f0ea99ca85deba49fd364 Mon Sep 17 00:00:00 2001
From: Zihan Yang
Date: Wed, 25 Apr 2018 17:52:23 +0800
Subject: hw/pci-host/q35: Replace hardcoded value with macro

During SMRAM region initialization some addresses are hardcoded;
replace them with macros to be clearer to readers. The previous patch
missed one value and exceeded the line limit of 90 characters; v2
breaks up a few long lines.

Signed-off-by: Zihan Yang
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/pci-host/q35.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'hw')

diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index a36a119..02f9576 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -535,13 +535,15 @@ static void mch_realize(PCIDevice *d, Error **errp)
 
     /* if *disabled* show SMRAM to all CPUs */
     memory_region_init_alias(&mch->smram_region, OBJECT(mch), "smram-region",
-                             mch->pci_address_space, 0xa0000, 0x20000);
-    memory_region_add_subregion_overlap(mch->system_memory, 0xa0000,
+                             mch->pci_address_space, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+                             MCH_HOST_BRIDGE_SMRAM_C_SIZE);
+    memory_region_add_subregion_overlap(mch->system_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
                                         &mch->smram_region, 1);
     memory_region_set_enabled(&mch->smram_region, true);
 
     memory_region_init_alias(&mch->open_high_smram, OBJECT(mch), "smram-open-high",
-                             mch->ram_memory, 0xa0000, 0x20000);
+                             mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+                             MCH_HOST_BRIDGE_SMRAM_C_SIZE);
     memory_region_add_subregion_overlap(mch->system_memory, 0xfeda0000,
                                         &mch->open_high_smram, 1);
     memory_region_set_enabled(&mch->open_high_smram, false);
@@ -550,11 +552,14 @@ static void mch_realize(PCIDevice *d, Error **errp)
     memory_region_init(&mch->smram, OBJECT(mch), "smram", 1ull << 32);
     memory_region_set_enabled(&mch->smram, true);
     memory_region_init_alias(&mch->low_smram, OBJECT(mch), "smram-low",
-                             mch->ram_memory, 0xa0000, 0x20000);
+                             mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+                             MCH_HOST_BRIDGE_SMRAM_C_SIZE);
     memory_region_set_enabled(&mch->low_smram, true);
-    memory_region_add_subregion(&mch->smram, 0xa0000, &mch->low_smram);
+    memory_region_add_subregion(&mch->smram, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+                                &mch->low_smram);
     memory_region_init_alias(&mch->high_smram, OBJECT(mch), "smram-high",
-                             mch->ram_memory, 0xa0000, 0x20000);
+                             mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+                             MCH_HOST_BRIDGE_SMRAM_C_SIZE);
     memory_region_set_enabled(&mch->high_smram, true);
     memory_region_add_subregion(&mch->smram, 0xfeda0000, &mch->high_smram);
-- cgit v1.1

From b7b126442977dcfa5641e38b084bd13a0b366cff Mon Sep 17 00:00:00 2001
From: Jonathan Helman
Date: Mon, 19 Mar 2018 15:28:49 -0700
Subject: virtio-balloon: add hugetlb page allocation counts

qemu should read and report hugetlb page allocation counts exported
in the following kernel patch:

    commit 4c3ca37c4a4394978fd0f005625f6064ed2b9a64
    Author: Jonathan Helman
    Date:   Mon Mar 19 11:00:35 2018 -0700

        virtio_balloon: export hugetlb page allocation counts

        Export the number of successful and failed hugetlb page
        allocations via the virtio balloon driver. These 2 counts come
        directly from the vm_events HTLB_BUDDY_PGALLOC and
        HTLB_BUDDY_PGALLOC_FAIL.

Signed-off-by: Jonathan Helman
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Jason Wang
---
 hw/virtio/virtio-balloon.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'hw')

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index f456cea..1f7a87f 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -52,6 +52,8 @@ static const char *balloon_stat_names[] = {
     [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
     [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
     [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
+    [VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc",
+    [VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail",
     [VIRTIO_BALLOON_S_NR] = NULL
 };
-- cgit v1.1
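[Editor's note: each balloon statistic arrives from the guest as a (tag, value)
pair, and the tag indexes balloon_stat_names[] above to form the property name
exposed under the balloon device's guest-stats. A minimal sketch of that lookup,
assuming only the array and the VIRTIO_BALLOON_S_* constants from this patch;
the helper itself is hypothetical, not QEMU code:

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical helper: translate a guest-reported stat tag into its
     * property name, e.g. VIRTIO_BALLOON_S_HTLB_PGALLOC maps to
     * "stat-htlb-pgalloc"; unknown or unset tags yield NULL. */
    static const char *balloon_stat_name(uint16_t tag)
    {
        if (tag < VIRTIO_BALLOON_S_NR && balloon_stat_names[tag]) {
            return balloon_stat_names[tag];
        }
        return NULL;
    }

With a guest kernel carrying the quoted driver patch, the two new hugetlb
counters then show up alongside the existing ones when guest statistics are
polled.]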
From ffcbbe722fcb3c162318cba1b94a115498e25acd Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 27 Apr 2018 17:07:24 +0800
Subject: vhost: add trace for IOTLB miss

Add some trace points for IOTLB translation for vhost. After
vhost-user is set up, the only I/O path QEMU still participates in
should be the IOMMU translation, so it is good to be able to track it
with explicit timestamps when needed, to see how long a translation
takes and whether anything is stuck inside. It might be useful for
triaging vhost-user problems.

Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/virtio/trace-events | 1 +
 hw/virtio/vhost.c      | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'hw')

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 1422ff0..07bcbe9 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -6,6 +6,7 @@ vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t
 vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64
 vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64
 vhost_section(const char *name, int r) "%s:%d"
+vhost_iotlb_miss(void *dev, int step) "%p step %d"
 
 # hw/virtio/vhost-user.c
 vhost_user_postcopy_end_entry(void) ""

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 9d5850a..b082900 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -894,12 +894,15 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
 
     rcu_read_lock();
 
+    trace_vhost_iotlb_miss(dev, 1);
+
     iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                           iova, write);
     if (iotlb.target_as != NULL) {
         ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                          &uaddr, &len);
         if (ret) {
+            trace_vhost_iotlb_miss(dev, 3);
             error_report("Fail to lookup the translated address "
                          "%"PRIx64, iotlb.translated_addr);
             goto out;
@@ -911,10 +914,14 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
         ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
                                                 len, iotlb.perm);
         if (ret) {
+            trace_vhost_iotlb_miss(dev, 4);
             error_report("Fail to update device iotlb");
             goto out;
         }
     }
+
+    trace_vhost_iotlb_miss(dev, 2);
+
 out:
     rcu_read_unlock();
-- cgit v1.1
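[Editor's note: the "step" argument above is a bare progress marker; the source
defines no named constants for it. A reader-side legend inferred from the call
sites in this patch (the enumerator names are mine, not QEMU's):

    /* Meaning of the integer "step" passed to trace_vhost_iotlb_miss(). */
    enum vhost_iotlb_miss_step {
        VHOST_IOTLB_MISS_START       = 1, /* entered the miss handler */
        VHOST_IOTLB_MISS_DONE        = 2, /* reached the end without error */
        VHOST_IOTLB_MISS_LOOKUP_FAIL = 3, /* no uaddr for translated_addr */
        VHOST_IOTLB_MISS_UPDATE_FAIL = 4, /* backend IOTLB update failed */
    };

The timestamp delta between step 1 and step 2 for a given device is the
translation latency the commit message refers to.]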
From 1814eab67398029a927847dfd29198a26aeaf7d8 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin"
Date: Tue, 17 Apr 2018 21:47:50 +0300
Subject: x86/cpu: use standard-headers/asm-x86.kvm_para.h

Switch to the header we imported from Linux; this allows us to drop a
hack in kvm_i386.h. More code will be dropped in the next patch.

Signed-off-by: Michael S. Tsirkin
---
 hw/i386/kvm/clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'hw')

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 7dac319..0bf1c60 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -26,7 +26,7 @@
 #include "qapi/error.h"
 
 #include <linux/kvm.h>
-#include <asm/kvm_para.h>
+#include "standard-headers/asm-x86/kvm_para.h"
 
 #define TYPE_KVM_CLOCK "kvmclock"
 #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
-- cgit v1.1

From 1f3a4519b1c107b5db2434b30638353978366b4d Mon Sep 17 00:00:00 2001
From: Tiwei Bie
Date: Thu, 12 Apr 2018 23:12:29 +0800
Subject: vhost-user: support receiving file descriptors in slave_read

Signed-off-by: Tiwei Bie
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/virtio/vhost-user.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

(limited to 'hw')

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 38da869..85d8fd2 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -852,14 +852,44 @@ static void slave_read(void *opaque)
     VhostUserHeader hdr = { 0, };
     VhostUserPayload payload = { 0, };
     int size, ret = 0;
+    struct iovec iov;
+    struct msghdr msgh;
+    int fd = -1;
+    char control[CMSG_SPACE(sizeof(fd))];
+    struct cmsghdr *cmsg;
+    size_t fdsize;
+
+    memset(&msgh, 0, sizeof(msgh));
+    msgh.msg_iov = &iov;
+    msgh.msg_iovlen = 1;
+    msgh.msg_control = control;
+    msgh.msg_controllen = sizeof(control);
 
     /* Read header */
-    size = read(u->slave_fd, &hdr, VHOST_USER_HDR_SIZE);
+    iov.iov_base = &hdr;
+    iov.iov_len = VHOST_USER_HDR_SIZE;
+
+    size = recvmsg(u->slave_fd, &msgh, 0);
     if (size != VHOST_USER_HDR_SIZE) {
         error_report("Failed to read from slave.");
         goto err;
     }
 
+    if (msgh.msg_flags & MSG_CTRUNC) {
+        error_report("Truncated message.");
+        goto err;
+    }
+
+    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+         cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+        if (cmsg->cmsg_level == SOL_SOCKET &&
+            cmsg->cmsg_type == SCM_RIGHTS) {
+            fdsize = cmsg->cmsg_len - CMSG_LEN(0);
+            memcpy(&fd, CMSG_DATA(cmsg), fdsize);
+            break;
+        }
+    }
+
     if (hdr.size > VHOST_USER_PAYLOAD_SIZE) {
         error_report("Failed to read msg header."
                      " Size %d exceeds the maximum %zu.", hdr.size,
@@ -883,9 +913,15 @@ static void slave_read(void *opaque)
         break;
     default:
         error_report("Received unexpected msg type.");
+        if (fd != -1) {
+            close(fd);
+        }
         ret = -EINVAL;
     }
 
+    /* Message handlers need to make sure that fd will be consumed. */
+    fd = -1;
+
     /*
      * REPLY_ACK feature handling. Other reply types has to be managed
      * directly in their request handlers.
@@ -918,6 +954,9 @@ err:
     qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
     close(u->slave_fd);
     u->slave_fd = -1;
+    if (fd != -1) {
+        close(fd);
+    }
     return;
 }
-- cgit v1.1
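[Editor's note: the cmsg loop added to slave_read() is the standard POSIX
recipe for receiving a file descriptor over a UNIX domain socket. A
self-contained sketch of the same technique, stripped of the vhost-user
specifics (the helper name and the simplified error handling are mine):

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Receive one byte of payload plus an optional fd attached via
     * SCM_RIGHTS; returns the fd, or -1 if none was attached. Real code,
     * like slave_read() above, must also check MSG_CTRUNC. */
    static int recv_one_fd(int sock)
    {
        char data;
        struct iovec iov = { .iov_base = &data, .iov_len = sizeof(data) };
        int fd = -1;
        char control[CMSG_SPACE(sizeof(fd))];
        struct msghdr msgh = {
            .msg_iov = &iov, .msg_iovlen = 1,
            .msg_control = control, .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmsg;

        if (recvmsg(sock, &msgh, 0) < 0) {
            return -1;
        }
        for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
             cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
            if (cmsg->cmsg_level == SOL_SOCKET &&
                cmsg->cmsg_type == SCM_RIGHTS) {
                memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
                break;
            }
        }
        return fd;
    }

Note the control buffer is sized with CMSG_SPACE() and the descriptor is
copied out with memcpy() rather than a pointer cast, since CMSG_DATA() is not
guaranteed to be suitably aligned for direct dereference.]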
From 6f80e6170ede13605817e5c0ca73db0de7bdf261 Mon Sep 17 00:00:00 2001
From: Tiwei Bie
Date: Thu, 12 Apr 2018 23:12:30 +0800
Subject: virtio: support setting memory region based host notifier

This patch introduces support for setting memory-region-based host
notifiers for a virtio device. This is helpful when using a hardware
accelerator for a virtio device, because hardware depends heavily on
the notification; this allows the guest driver in the VM to notify
the hardware directly.

Signed-off-by: Tiwei Bie
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/virtio/virtio-pci.c | 22 ++++++++++++++++++++++
 hw/virtio/virtio.c     | 13 +++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'hw')

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 1e8ab7b..5eb0c32 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1037,6 +1037,27 @@ assign_error:
     return r;
 }
 
+static int virtio_pci_set_host_notifier_mr(DeviceState *d, int n,
+                                           MemoryRegion *mr, bool assign)
+{
+    VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+    int offset;
+
+    if (n >= VIRTIO_QUEUE_MAX || !virtio_pci_modern(proxy) ||
+        virtio_pci_queue_mem_mult(proxy) != memory_region_size(mr)) {
+        return -1;
+    }
+
+    if (assign) {
+        offset = virtio_pci_queue_mem_mult(proxy) * n;
+        memory_region_add_subregion_overlap(&proxy->notify.mr, offset, mr, 1);
+    } else {
+        memory_region_del_subregion(&proxy->notify.mr, mr);
+    }
+
+    return 0;
+}
+
 static void virtio_pci_vmstate_change(DeviceState *d, bool running)
 {
     VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -2652,6 +2673,7 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data)
     k->has_extra_state = virtio_pci_has_extra_state;
     k->query_guest_notifiers = virtio_pci_query_guest_notifiers;
     k->set_guest_notifiers = virtio_pci_set_guest_notifiers;
+    k->set_host_notifier_mr = virtio_pci_set_host_notifier_mr;
     k->vmstate_change = virtio_pci_vmstate_change;
     k->pre_plugged = virtio_pci_pre_plugged;
     k->device_plugged = virtio_pci_device_plugged;

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 006d3d1..1debb01 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2454,6 +2454,19 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq)
     return &vq->host_notifier;
 }
 
+int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n,
+                                      MemoryRegion *mr, bool assign)
+{
+    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+    if (k->set_host_notifier_mr) {
+        return k->set_host_notifier_mr(qbus->parent, n, mr, assign);
+    }
+
+    return -1;
+}
+
 void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
 {
     g_free(vdev->bus_name);
-- cgit v1.1

From 9952e807fd016c95f50372536f1ac65a601be6e4 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert"
Date: Wed, 2 May 2018 11:55:52 +0100
Subject: vhost-user+postcopy: Use qemu_set_nonblock

Use qemu_set_nonblock rather than a simple fcntl; cleaner, and I have
no reason to change other flags.

Reported-by: Peter Maydell
Signed-off-by: Dr. David Alan Gilbert
Reviewed-by: Eric Blake
Reviewed-by: Stefan Hajnoczi
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S.
Tsirkin --- hw/virtio/vhost-user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 85d8fd2..41cbd8a 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1115,7 +1115,7 @@ static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp) error_setg(errp, "%s: Failed to get ufd", __func__); return -1; } - fcntl(ufd, F_SETFL, O_NONBLOCK); + qemu_set_nonblock(ufd); /* register ufd with userfault thread */ u->postcopy_fd.fd = ufd; -- cgit v1.1 From ebf2a499a5c43d5eaee2b70ab5ae655af49d935c Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 11 May 2018 21:59:40 -0700 Subject: hw/virtio: Fix brace Werror with clang 6.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The warning is hw/virtio/vhost-user.c:1319:26: error: suggest braces around initialization of subobject [-Werror,-Wmissing-braces] VhostUserMsg msg = { 0 }; ^ {} While the original code is correct, and technically exactly correct as per ISO C89, both GCC and Clang support plain empty set of braces as an extension. Cc: Michael S. Tsirkin Signed-off-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Blake Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 41cbd8a..ca554d4 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1355,7 +1355,7 @@ static bool vhost_user_requires_shm_log(struct vhost_dev *dev) static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) { - VhostUserMsg msg = { 0 }; + VhostUserMsg msg = { }; assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); -- cgit v1.1 From 1a97a478e69534f70ebbed1f27c315dbdc23f94f Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 21 May 2018 10:32:00 -0600 Subject: nvdimm: fix typo in label-size definition Signed-off-by: Ross Zwisler Fixes: commit da6789c27c2e ("nvdimm: add a macro for property "label-size"") Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov Cc: Haozhong Zhang Cc: Michael S. Tsirkin Cc: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/mem/nvdimm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index acb656b..4087aca 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -89,7 +89,7 @@ static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp) static void nvdimm_init(Object *obj) { - object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int", + object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); object_property_add_bool(obj, NVDIMM_UNARMED_PROP, -- cgit v1.1 From 36d2d52bdb45f5b753a61fdaf0fe7891f1f5b61d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:09 +0800 Subject: intel-iommu: send PSI always even if across PDEs SECURITY IMPLICATION: without this patch, any guest with both assigned device and a vIOMMU might encounter stale IO page mappings even if guest has already unmapped the page, which may lead to guest memory corruption. The stale mappings will only be limited to the guest's own memory range, so it should not affect the host memory or other guests on the host. 
During IOVA page table walking, there is a special case when the PSI
covers one whole PDE (Page Directory Entry, which contains 512 Page
Table Entries) or more. In the past we skipped such entries and did
not notify the IOMMU notifiers. This is not correct: we should send
an UNMAP notification to registered UNMAP notifiers in this case.

For UNMAP-only notifiers, the old behavior might leave IOTLB entries
cached in the devices even though they are already invalid. For
MAP/UNMAP notifiers like vfio-pci, it will cause stale page mappings.

This special case doesn't trigger often, but it is very easily
triggered by nested device assignment, since in that case we'll
possibly map the whole L2 guest RAM region into the device's IOVA
address space (several GBs at least), which is far bigger than normal
kernel driver usages of the device (tens of MBs normally).

Without this patch applied to L1 QEMU, nested device assignment to L2
guests will dump some errors like:

qemu-system-x86_64: VFIO_MAP_DMA: -17
qemu-system-x86_64: vfio_dma_map(0x557305420c30, 0xad000, 0x1000,
                    0x7f89a920d000) = -17 (File exists)

CC: QEMU Stable
Acked-by: Jason Wang
[peterx: rewrite the commit message]
Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index fb31de9..b359efd 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -722,6 +722,15 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
 
 typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
 
+static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
+                             vtd_page_walk_hook hook_fn, void *private)
+{
+    assert(hook_fn);
+    trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr,
+                            entry->addr_mask, entry->perm);
+    return hook_fn(entry, private);
+}
+
 /**
  * vtd_page_walk_level - walk over specific level for IOVA range
  *
@@ -781,28 +790,37 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
          */
         entry_valid = read_cur | write_cur;
 
+        entry.target_as = &address_space_memory;
+        entry.iova = iova & subpage_mask;
+        entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
+        entry.addr_mask = ~subpage_mask;
+
         if (vtd_is_last_slpte(slpte, level)) {
-            entry.target_as = &address_space_memory;
-            entry.iova = iova & subpage_mask;
             /* NOTE: this is only meaningful if entry_valid == true */
             entry.translated_addr = vtd_get_slpte_addr(slpte, aw);
-            entry.addr_mask = ~subpage_mask;
-            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
             if (!entry_valid && !notify_unmap) {
                 trace_vtd_page_walk_skip_perm(iova, iova_next);
                 goto next;
             }
-            trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
-                                    entry.addr_mask, entry.perm);
-            if (hook_fn) {
-                ret = hook_fn(&entry, private);
-                if (ret < 0) {
-                    return ret;
-                }
+            ret = vtd_page_walk_one(&entry, level, hook_fn, private);
+            if (ret < 0) {
+                return ret;
             }
         } else {
             if (!entry_valid) {
-                trace_vtd_page_walk_skip_perm(iova, iova_next);
+                if (notify_unmap) {
+                    /*
+                     * The whole entry is invalid; unmap it all.
+                     * Translated address is meaningless, zero it.
+                     */
+                    entry.translated_addr = 0x0;
+                    ret = vtd_page_walk_one(&entry, level, hook_fn, private);
+                    if (ret < 0) {
+                        return ret;
+                    }
+                } else {
+                    trace_vtd_page_walk_skip_perm(iova, iova_next);
+                }
                 goto next;
             }
             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova,
-- cgit v1.1

From b4a4ba0d68f50f218ee3957b6638dbee32a5eeef Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 18 May 2018 15:25:10 +0800
Subject: intel-iommu: remove IntelIOMMUNotifierNode

That is not really necessary: remove the node struct and put the list
entry directly into VTDAddressSpace. It simplifies the code a lot.
While at it, rename the old notifiers_list into vtd_as_with_notifiers.

CC: QEMU Stable
Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 41 +++++++++++------------------------------
 1 file changed, 11 insertions(+), 30 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index b359efd..3df9045 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1248,10 +1248,10 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
 
 static void vtd_iommu_replay_all(IntelIOMMUState *s)
 {
-    IntelIOMMUNotifierNode *node;
+    VTDAddressSpace *vtd_as;
 
-    QLIST_FOREACH(node, &s->notifiers_list, next) {
-        memory_region_iommu_replay_all(&node->vtd_as->iommu);
+    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
+        memory_region_iommu_replay_all(&vtd_as->iommu);
     }
 }
 
@@ -1372,7 +1372,6 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s,
                                         uint16_t domain_id)
 {
-    IntelIOMMUNotifierNode *node;
     VTDContextEntry ce;
     VTDAddressSpace *vtd_as;
 
@@ -1381,8 +1380,7 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
                                 &domain_id);
 
-    QLIST_FOREACH(node, &s->notifiers_list, next) {
-        vtd_as = node->vtd_as;
+    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                       vtd_as->devfn, &ce) &&
             domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
@@ -1402,12 +1400,11 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
                                            uint16_t domain_id, hwaddr addr,
                                            uint8_t am)
 {
-    IntelIOMMUNotifierNode *node;
+    VTDAddressSpace *vtd_as;
     VTDContextEntry ce;
     int ret;
 
-    QLIST_FOREACH(node, &(s->notifiers_list), next) {
-        VTDAddressSpace *vtd_as = node->vtd_as;
+    QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
         ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                        vtd_as->devfn, &ce);
         if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
@@ -2344,8 +2341,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
 {
     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
     IntelIOMMUState *s = vtd_as->iommu_state;
-    IntelIOMMUNotifierNode *node = NULL;
-    IntelIOMMUNotifierNode *next_node = NULL;
 
     if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) {
         error_report("We need to set caching-mode=1 for intel-iommu to enable "
@@ -2354,21 +2349,9 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
     }
 
     if (old == IOMMU_NOTIFIER_NONE) {
-        node = g_malloc0(sizeof(*node));
-        node->vtd_as = vtd_as;
-        QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
-        return;
-    }
-
-    /* update notifier node with new flags */
-    QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) {
-        if (node->vtd_as == vtd_as) {
-            if (new == IOMMU_NOTIFIER_NONE) {
-                QLIST_REMOVE(node,
next); - g_free(node); - } - return; - } + QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(vtd_as, next); } } @@ -2838,12 +2821,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) static void vtd_address_space_unmap_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; VTDAddressSpace *vtd_as; IOMMUNotifier *n; - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { vtd_address_space_unmap(vtd_as, n); } @@ -3088,7 +3069,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) return; } - QLIST_INIT(&s->notifiers_list); + QLIST_INIT(&s->vtd_as_with_notifiers); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); -- cgit v1.1 From 1d9efa73e12ddf361ea997c2d532cc4afa6674d1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:11 +0800 Subject: intel-iommu: add iommu lock SECURITY IMPLICATION: this patch fixes a potential race when multiple threads access the IOMMU IOTLB cache. Add a per-iommu big lock to protect IOMMU status. Currently the only thing to be protected is the IOTLB/context cache, since that can be accessed even without BQL, e.g., in IO dataplane. Note that we don't need to protect device page tables since that's fully controlled by the guest kernel. However there is still possibility that malicious drivers will program the device to not obey the rule. In that case QEMU can't really do anything useful, instead the guest itself will be responsible for all uncertainties. CC: QEMU Stable Reported-by: Fam Zheng Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 56 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 9 deletions(-) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3df9045..8d4069d 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -128,6 +128,16 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, return new_val; } +static inline void vtd_iommu_lock(IntelIOMMUState *s) +{ + qemu_mutex_lock(&s->iommu_lock); +} + +static inline void vtd_iommu_unlock(IntelIOMMUState *s) +{ + qemu_mutex_unlock(&s->iommu_lock); +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -172,9 +182,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, } /* Reset all the gen of VTDAddressSpace to zero and set the gen of - * IntelIOMMUState to 1. + * IntelIOMMUState to 1. Must be called with IOMMU lock held. */ -static void vtd_reset_context_cache(IntelIOMMUState *s) +static void vtd_reset_context_cache_locked(IntelIOMMUState *s) { VTDAddressSpace *vtd_as; VTDBus *vtd_bus; @@ -197,12 +207,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s) s->context_cache_gen = 1; } -static void vtd_reset_iotlb(IntelIOMMUState *s) +/* Must be called with IOMMU lock held. 
*/ +static void vtd_reset_iotlb_locked(IntelIOMMUState *s) { assert(s->iotlb); g_hash_table_remove_all(s->iotlb); } +static void vtd_reset_iotlb(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); +} + static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, uint32_t level) { @@ -215,6 +233,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; } +/* Must be called with IOMMU lock held */ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, hwaddr addr) { @@ -235,6 +254,7 @@ out: return entry; } +/* Must be with IOMMU lock held */ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint16_t domain_id, hwaddr addr, uint64_t slpte, uint8_t access_flags, uint32_t level) @@ -246,7 +266,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { trace_vtd_iotlb_reset("iotlb exceeds size limit"); - vtd_reset_iotlb(s); + vtd_reset_iotlb_locked(s); } entry->gfn = gfn; @@ -1106,7 +1126,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, IntelIOMMUState *s = vtd_as->iommu_state; VTDContextEntry ce; uint8_t bus_num = pci_bus_num(bus); - VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry; + VTDContextCacheEntry *cc_entry; uint64_t slpte, page_mask; uint32_t level; uint16_t source_id = vtd_make_source_id(bus_num, devfn); @@ -1123,6 +1143,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, */ assert(!vtd_is_interrupt_addr(addr)); + vtd_iommu_lock(s); + + cc_entry = &vtd_as->context_cache_entry; + /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { @@ -1182,7 +1206,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, * IOMMU region can be swapped back. */ vtd_pt_enable_fast_path(s, source_id); - + vtd_iommu_unlock(s); return true; } @@ -1203,6 +1227,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte, access_flags, level); out: + vtd_iommu_unlock(s); entry->iova = addr & page_mask; entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; entry->addr_mask = ~page_mask; @@ -1210,6 +1235,7 @@ out: return true; error: + vtd_iommu_unlock(s); entry->iova = 0; entry->translated_addr = 0; entry->addr_mask = 0; @@ -1258,10 +1284,13 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) static void vtd_context_global_invalidate(IntelIOMMUState *s) { trace_vtd_inv_desc_cc_global(); + /* Protects context cache */ + vtd_iommu_lock(s); s->context_cache_gen++; if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { - vtd_reset_context_cache(s); + vtd_reset_context_cache_locked(s); } + vtd_iommu_unlock(s); vtd_switch_address_space_all(s); /* * From VT-d spec 6.5.2.1, a global context entry invalidation @@ -1313,7 +1342,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), VTD_PCI_FUNC(devfn_it)); + vtd_iommu_lock(s); vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); /* * Do switch address space when needed, in case if the * device passthrough bit is switched. 
@@ -1377,8 +1408,10 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
 
     trace_vtd_inv_desc_iotlb_domain(domain_id);
 
+    vtd_iommu_lock(s);
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
                                 &domain_id);
+    vtd_iommu_unlock(s);
 
     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
@@ -1426,7 +1459,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
     info.domain_id = domain_id;
     info.addr = addr;
     info.mask = ~((1 << am) - 1);
+    vtd_iommu_lock(s);
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+    vtd_iommu_unlock(s);
     vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
 }
 
@@ -2929,8 +2964,10 @@ static void vtd_init(IntelIOMMUState *s)
         s->cap |= VTD_CAP_CM;
     }
 
-    vtd_reset_context_cache(s);
-    vtd_reset_iotlb(s);
+    vtd_iommu_lock(s);
+    vtd_reset_context_cache_locked(s);
+    vtd_reset_iotlb_locked(s);
+    vtd_iommu_unlock(s);
 
     /* Define registers with default values and bit semantics */
     vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
@@ -3070,6 +3107,7 @@ static void vtd_realize(DeviceState *dev, Error **errp)
     }
 
     QLIST_INIT(&s->vtd_as_with_notifiers);
+    qemu_mutex_init(&s->iommu_lock);
     memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
     memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
                           "intel_iommu", DMAR_REG_SIZE);
-- cgit v1.1

From 4f8a62a933a79094e44bc1b16b63bb23e62d67b4 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 18 May 2018 15:25:12 +0800
Subject: intel-iommu: only do page walk for MAP notifiers

For UNMAP-only IOMMU notifiers, we don't need to walk the page tables.
Speed that procedure up by skipping the page table walk. That should
boost performance for UNMAP-only notifiers like vhost.

CC: QEMU Stable
Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 44 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 8d4069d..38ccc74 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -138,6 +138,12 @@ static inline void vtd_iommu_unlock(IntelIOMMUState *s)
     qemu_mutex_unlock(&s->iommu_lock);
 }
 
+/* Whether the address space needs to notify new mappings */
+static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
+{
+    return as->notifier_flags & IOMMU_NOTIFIER_MAP;
+}
+
 /* GHashTable functions */
 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
 {
@@ -1436,14 +1442,36 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
     VTDAddressSpace *vtd_as;
     VTDContextEntry ce;
     int ret;
+    hwaddr size = (1 << am) * VTD_PAGE_SIZE;
 
     QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
         ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                        vtd_as->devfn, &ce);
         if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
-            vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
-                          vtd_page_invalidate_notify_hook,
-                          (void *)&vtd_as->iommu, true, s->aw_bits);
+            if (vtd_as_has_map_notifier(vtd_as)) {
+                /*
+                 * As long as we have MAP notifications registered in
+                 * any of our IOMMU notifiers, we need to sync the
+                 * shadow page table.
+                 */
+                vtd_page_walk(&ce, addr, addr + size,
+                              vtd_page_invalidate_notify_hook,
+                              (void *)&vtd_as->iommu, true, s->aw_bits);
+            } else {
+                /*
+                 * For UNMAP-only notifiers, we don't need to walk the
+                 * page tables.  We just deliver the PSI down to
+                 * invalidate caches.
+                 */
+                IOMMUTLBEntry entry = {
+                    .target_as = &address_space_memory,
+                    .iova = addr,
+                    .translated_addr = 0,
+                    .addr_mask = size - 1,
+                    .perm = IOMMU_NONE,
+                };
+                memory_region_notify_iommu(&vtd_as->iommu, entry);
+            }
         }
     }
 }
@@ -2383,6 +2411,9 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
         exit(1);
     }
 
+    /* Update per-address-space notifier flags */
+    vtd_as->notifier_flags = new;
+
     if (old == IOMMU_NOTIFIER_NONE) {
         QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
     } else if (new == IOMMU_NOTIFIER_NONE) {
@@ -2891,8 +2922,11 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
                                   PCI_FUNC(vtd_as->devfn),
                                   VTD_CONTEXT_ENTRY_DID(ce.hi),
                                   ce.hi, ce.lo);
-        vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false,
-                      s->aw_bits);
+        if (vtd_as_has_map_notifier(vtd_as)) {
+            /* This is required only for MAP typed notifiers */
+            vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false,
+                          s->aw_bits);
+        }
     } else {
         trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
                                     PCI_FUNC(vtd_as->devfn));
-- cgit v1.1
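[Editor's note: the consumer this fast path targets is vhost, which registers
an UNMAP-only notifier, so vtd_as_has_map_notifier() returns false for its
address space. A simplified sketch of such a registration, loosely modeled on
hw/virtio/vhost.c of this period — the argument details are approximate from
memory, not a verbatim quote:

    /* Register interest in UNMAP events only; IOMMU_NOTIFIER_MAP is
     * deliberately not set, so intel-iommu skips the shadow page walk
     * for this address space and just forwards invalidation ranges. */
    iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
                        IOMMU_NOTIFIER_UNMAP,
                        section->offset_within_region,
                        int128_get64(end));
    memory_region_register_iommu_notifier(section->mr, &iommu->n);

A vfio-pci device, by contrast, registers both MAP and UNMAP and therefore
still takes the vtd_page_walk() path above.]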
From fe215b0cbb8c1f4b4af0a64aa5c02042080dd537 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 18 May 2018 15:25:13 +0800
Subject: intel-iommu: introduce vtd_page_walk_info

During the recursive page walking of IOVA page tables, some stack
variables are constants and never change during the whole page
walking procedure. Isolate them into a struct so that we don't need
to pass those constants down the stack again at every level.

CC: QEMU Stable
Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 84 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 32 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 38ccc74..e247269 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -748,9 +748,27 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
 
 typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
 
+/**
+ * Constant information used during page walking
+ *
+ * @hook_fn: hook func to be called when detected page
+ * @private: private data to be passed into hook func
+ * @notify_unmap: whether we should notify invalid entries
+ * @aw: maximum address width
+ */
+typedef struct {
+    vtd_page_walk_hook hook_fn;
+    void *private;
+    bool notify_unmap;
+    uint8_t aw;
+} vtd_page_walk_info;
+
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
-                             vtd_page_walk_hook hook_fn, void *private)
+                             vtd_page_walk_info *info)
 {
+    vtd_page_walk_hook hook_fn = info->hook_fn;
+    void *private = info->private;
+
     assert(hook_fn);
     trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr,
                             entry->addr_mask, entry->perm);
@@ -763,17 +781,13 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
  * @addr: base GPA addr to start the walk
  * @start: IOVA range start address
  * @end: IOVA range end address (start <= addr < end)
- * @hook_fn: hook func to be called when detected page
- * @private: private data to be passed into hook func
 * @read: whether parent level has read permission
 * @write: whether parent level has write permission
- * @notify_unmap: whether we should notify invalid entries
- * @aw: maximum address width
+ * @info: constant information for the page walk
 */
 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
-                               uint64_t end,
vtd_page_walk_hook hook_fn, - void *private, uint32_t level, bool read, - bool write, bool notify_unmap, uint8_t aw) + uint64_t end, uint32_t level, bool read, + bool write, vtd_page_walk_info *info) { bool read_cur, write_cur, entry_valid; uint32_t offset; @@ -823,24 +837,24 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, if (vtd_is_last_slpte(slpte, level)) { /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte, aw); - if (!entry_valid && !notify_unmap) { + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + if (!entry_valid && !info->notify_unmap) { trace_vtd_page_walk_skip_perm(iova, iova_next); goto next; } - ret = vtd_page_walk_one(&entry, level, hook_fn, private); + ret = vtd_page_walk_one(&entry, level, info); if (ret < 0) { return ret; } } else { if (!entry_valid) { - if (notify_unmap) { + if (info->notify_unmap) { /* * The whole entry is invalid; unmap it all. * Translated address is meaningless, zero it. */ entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, level, hook_fn, private); + ret = vtd_page_walk_one(&entry, level, info); if (ret < 0) { return ret; } @@ -849,10 +863,9 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, } goto next; } - ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova, - MIN(iova_next, end), hook_fn, private, - level - 1, read_cur, write_cur, - notify_unmap, aw); + ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), + iova, MIN(iova_next, end), level - 1, + read_cur, write_cur, info); if (ret < 0) { return ret; } @@ -871,28 +884,24 @@ next: * @ce: context entry to walk upon * @start: IOVA address to start the walk * @end: IOVA range end address (start <= addr < end) - * @hook_fn: the hook that to be called for each detected area - * @private: private data for the hook function - * @aw: maximum address width + * @info: page walking information struct */ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, - vtd_page_walk_hook hook_fn, void *private, - bool notify_unmap, uint8_t aw) + vtd_page_walk_info *info) { dma_addr_t addr = vtd_ce_get_slpt_base(ce); uint32_t level = vtd_ce_get_level(ce); - if (!vtd_iova_range_check(start, ce, aw)) { + if (!vtd_iova_range_check(start, ce, info->aw)) { return -VTD_FR_ADDR_BEYOND_MGAW; } - if (!vtd_iova_range_check(end, ce, aw)) { + if (!vtd_iova_range_check(end, ce, info->aw)) { /* Fix end so that it reaches the maximum */ - end = vtd_iova_limit(ce, aw); + end = vtd_iova_limit(ce, info->aw); } - return vtd_page_walk_level(addr, start, end, hook_fn, private, - level, true, true, notify_unmap, aw); + return vtd_page_walk_level(addr, start, end, level, true, true, info); } /* Map a device to its corresponding domain (context-entry) */ @@ -1449,14 +1458,19 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { if (vtd_as_has_map_notifier(vtd_as)) { + vtd_page_walk_info info = { + .hook_fn = vtd_page_invalidate_notify_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + }; + /* * As long as we have MAP notifications registered in * any of our IOMMU notifiers, we need to sync the * shadow page table. 
*/ - vtd_page_walk(&ce, addr, addr + size, - vtd_page_invalidate_notify_hook, - (void *)&vtd_as->iommu, true, s->aw_bits); + vtd_page_walk(&ce, addr, addr + size, &info); } else { /* * For UNMAP-only notifiers, we don't need to walk the @@ -2924,8 +2938,14 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) ce.hi, ce.lo); if (vtd_as_has_map_notifier(vtd_as)) { /* This is required only for MAP typed notifiers */ - vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, - s->aw_bits); + vtd_page_walk_info info = { + .hook_fn = vtd_replay_hook, + .private = (void *)n, + .notify_unmap = false, + .aw = s->aw_bits, + }; + + vtd_page_walk(&ce, 0, ~0ULL, &info); } } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), -- cgit v1.1 From 2f764fa87d2a81812b313dd6d998e10126292653 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:14 +0800 Subject: intel-iommu: pass in address space when page walk We pass in the VTDAddressSpace too. It'll be used in the follow up patches. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index e247269..a882894 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -754,9 +754,11 @@ typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); * @hook_fn: hook func to be called when detected page * @private: private data to be passed into hook func * @notify_unmap: whether we should notify invalid entries + * @as: VT-d address space of the device * @aw: maximum address width */ typedef struct { + VTDAddressSpace *as; vtd_page_walk_hook hook_fn; void *private; bool notify_unmap; @@ -1463,6 +1465,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, .private = (void *)&vtd_as->iommu, .notify_unmap = true, .aw = s->aw_bits, + .as = vtd_as, }; /* @@ -2943,6 +2946,7 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) .private = (void *)n, .notify_unmap = false, .aw = s->aw_bits, + .as = vtd_as, }; vtd_page_walk(&ce, 0, ~0ULL, &info); -- cgit v1.1 From d118c06ebbee2d23ddf873cae4a809311aa61310 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:15 +0800 Subject: intel-iommu: trace domain id during page walk This patch only modifies the trace points. Previously we were tracing page walk levels. They are redundant since we have page mask (size) already. Now we trace something much more useful which is the domain ID of the page walking. That can be very useful when we trace more than one devices on the same system, so that we can know which map is for which domain. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/i386/intel_iommu.c | 16 ++++++++++------ hw/i386/trace-events | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index a882894..61bb3d3 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -756,6 +756,7 @@ typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); * @notify_unmap: whether we should notify invalid entries * @as: VT-d address space of the device * @aw: maximum address width + * @domain: domain ID of the page walk */ typedef struct { VTDAddressSpace *as; @@ -763,17 +764,18 @@ typedef struct { void *private; bool notify_unmap; uint8_t aw; + uint16_t domain_id; } vtd_page_walk_info; -static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level, - vtd_page_walk_info *info) +static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) { vtd_page_walk_hook hook_fn = info->hook_fn; void *private = info->private; assert(hook_fn); - trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr, - entry->addr_mask, entry->perm); + trace_vtd_page_walk_one(info->domain_id, entry->iova, + entry->translated_addr, entry->addr_mask, + entry->perm); return hook_fn(entry, private); } @@ -844,7 +846,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, trace_vtd_page_walk_skip_perm(iova, iova_next); goto next; } - ret = vtd_page_walk_one(&entry, level, info); + ret = vtd_page_walk_one(&entry, info); if (ret < 0) { return ret; } @@ -856,7 +858,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, * Translated address is meaningless, zero it. */ entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, level, info); + ret = vtd_page_walk_one(&entry, info); if (ret < 0) { return ret; } @@ -1466,6 +1468,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, .notify_unmap = true, .aw = s->aw_bits, .as = vtd_as, + .domain_id = domain_id, }; /* @@ -2947,6 +2950,7 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) .notify_unmap = false, .aw = s->aw_bits, .as = vtd_as, + .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi), }; vtd_page_walk(&ce, 0, ~0ULL, &info); diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 22d4464..ca23ba9 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -39,7 +39,7 @@ vtd_fault_disabled(void) "Fault processing disabled for context entry" vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 -vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" 
vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
-- cgit v1.1

From 63b88968f139b6a77f2f81e6f1eedf70c0170a85 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 18 May 2018 15:25:17 +0800
Subject: intel-iommu: rework the page walk logic

This patch fixes a potential small window in which the DMA page table
might be incomplete or invalid when the guest sends domain/context
invalidations to a device. This can cause random DMA errors for
assigned devices.

This is a major change to the VT-d shadow page walking logic. It
includes but is not limited to:

- For each VTDAddressSpace, now we maintain what IOVA ranges we have
  mapped and what we have not. With that information, we only send
  MAP or UNMAP when necessary: we don't send MAP notifies if we know
  we have already mapped the range, and we don't send UNMAP notifies
  if we know we never mapped the range at all.

- Introduce vtd_sync_shadow_page_table[_range] APIs so that we can
  call them anywhere to resync the shadow page table for a device.

- When we receive a domain/context invalidation, we should not really
  run the replay logic; instead we use the new sync shadow page table
  API to resync the whole shadow page table without unmapping the
  whole region.

After this change, we'll only do the page walk once for each domain
invalidation (before this it could be multiple, depending on the
number of notifiers per address space).

While at it, the page walking logic is also refactored to be simpler.

CC: QEMU Stable
Reported-by: Jintack Lim
Tested-by: Jintack Lim
Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 213 ++++++++++++++++++++++++++++++++++++--------------
 hw/i386/trace-events  |   3 +-
 2 files changed, 157 insertions(+), 59 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 61bb3d3..b5a09b7 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -769,10 +769,77 @@ typedef struct {
 
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
 {
+    VTDAddressSpace *as = info->as;
     vtd_page_walk_hook hook_fn = info->hook_fn;
     void *private = info->private;
+    DMAMap target = {
+        .iova = entry->iova,
+        .size = entry->addr_mask,
+        .translated_addr = entry->translated_addr,
+        .perm = entry->perm,
+    };
+    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
+        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+        return 0;
+    }
 
     assert(hook_fn);
+
+    /* Update local IOVA mapped ranges */
+    if (entry->perm) {
+        if (mapped) {
+            /* If it's exactly the same translation, skip */
+            if (!memcmp(mapped, &target, sizeof(target))) {
+                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
+                                                 entry->translated_addr);
+                return 0;
+            } else {
+                /*
+                 * Translation changed. Normally this should not
+                 * happen, but it can happen when with buggy guest
+                 * OSes. Note that there will be a small window that
+                 * we don't have map at all. But that's the best
+                 * effort we can do. The ideal way to emulate this is
+                 * atomically modify the PTE to follow what has
+                 * changed, but we can't. One example is that vfio
+                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
+                 * interface to modify a mapping (meanwhile it seems
+                 * meaningless to even provide one). Anyway, let's
+                 * mark this as a TODO in case one day we'll have
+                 * a better solution.
+ */ + IOMMUAccessFlags cache_perm = entry->perm; + int ret; + + /* Emulate an UNMAP */ + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); + ret = hook_fn(entry, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); + /* Recover the correct permission */ + entry->perm = cache_perm; + } + } + iova_tree_insert(as->iova_tree, &target); + } else { + if (!mapped) { + /* Skip since we didn't map this range at all */ + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + iova_tree_remove(as->iova_tree, &target); + } + trace_vtd_page_walk_one(info->domain_id, entry->iova, entry->translated_addr, entry->addr_mask, entry->perm); @@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, */ entry_valid = read_cur | write_cur; - entry.target_as = &address_space_memory; - entry.iova = iova & subpage_mask; - entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); - entry.addr_mask = ~subpage_mask; - - if (vtd_is_last_slpte(slpte, level)) { - /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); - if (!entry_valid && !info->notify_unmap) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - ret = vtd_page_walk_one(&entry, info); - if (ret < 0) { - return ret; - } - } else { - if (!entry_valid) { - if (info->notify_unmap) { - /* - * The whole entry is invalid; unmap it all. - * Translated address is meaningless, zero it. - */ - entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, info); - if (ret < 0) { - return ret; - } - } else { - trace_vtd_page_walk_skip_perm(iova, iova_next); - } - goto next; - } + if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + /* + * This is a valid PDE (or even bigger than PDE). We need + * to walk one further level. + */ ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), iova, MIN(iova_next, end), level - 1, read_cur, write_cur, info); - if (ret < 0) { - return ret; - } + } else { + /* + * This means we are either: + * + * (1) the real page entry (either 4K page, or huge page) + * (2) the whole range is invalid + * + * In either case, we send an IOTLB notification down. + */ + entry.target_as = &address_space_memory; + entry.iova = iova & subpage_mask; + entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); + entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + ret = vtd_page_walk_one(&entry, info); + } + + if (ret < 0) { + return ret; } next: @@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return 0; } +static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, + void *private) +{ + memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); + return 0; +} + +/* If context entry is NULL, we'll try to fetch it on our own. 
*/ +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, + VTDContextEntry *ce, + hwaddr addr, hwaddr size) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + vtd_page_walk_info info = { + .hook_fn = vtd_sync_shadow_page_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + .as = vtd_as, + }; + VTDContextEntry ce_cache; + int ret; + + if (ce) { + /* If the caller provided context entry, use it */ + ce_cache = *ce; + } else { + /* If the caller didn't provide ce, try to fetch */ + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce_cache); + if (ret) { + /* + * This should not really happen, but in case it happens, + * we just skip the sync for this time. After all we even + * don't have the root table pointer! + */ + trace_vtd_err("Detected invalid context entry when " + "trying to sync shadow page table"); + return 0; + } + } + + info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi); + + return vtd_page_walk(&ce_cache, addr, addr + size, &info); +} + +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +{ + return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX); +} + /* * Fetch translation type for specific device. Returns <0 if error * happens, otherwise return the shifted type to check against @@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) VTDAddressSpace *vtd_as; QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } @@ -1371,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, vtd_switch_address_space(vtd_as); /* * So a device is moving out of (or moving into) a - * domain, a replay() suites here to notify all the - * IOMMU_NOTIFIER_MAP registers about this change. + * domain, resync the shadow page table. * This won't bring bad even if we have no such * notifier registered - the IOMMU notification * framework will skip MAP notifications if that * happened. */ - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } @@ -1436,18 +1543,11 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } -static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry, - void *private) -{ - memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); - return 0; -} - static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) @@ -1462,21 +1562,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { if (vtd_as_has_map_notifier(vtd_as)) { - vtd_page_walk_info info = { - .hook_fn = vtd_page_invalidate_notify_hook, - .private = (void *)&vtd_as->iommu, - .notify_unmap = true, - .aw = s->aw_bits, - .as = vtd_as, - .domain_id = domain_id, - }; - /* * As long as we have MAP notifications registered in * any of our IOMMU notifiers, we need to sync the * shadow page table. 
*/ - vtd_page_walk(&ce, addr, addr + size, &info); + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); } else { /* * For UNMAP-only notifiers, we don't need to walk the @@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) vtd_dev_as->devfn = (uint8_t)devfn; vtd_dev_as->iommu_state = s; vtd_dev_as->context_cache_entry.context_cache_gen = 0; + vtd_dev_as->iova_tree = iova_tree_new(); /* * Memory region relationships looks like (Address range shows @@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) hwaddr start = n->start; hwaddr end = n->end; IntelIOMMUState *s = as->iommu_state; + DMAMap map; /* * Note: all the codes in this function has a assumption that IOVA @@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) VTD_PCI_FUNC(as->devfn), entry.iova, size); + map.iova = entry.iova; + map.size = entry.addr_mask; + iova_tree_remove(as->iova_tree, &map); + memory_region_notify_one(n, &entry); } diff --git a/hw/i386/trace-events b/hw/i386/trace-events index ca23ba9..e14d06e 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64 +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" -vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 -- cgit v1.1
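[Editor's note: the new vtd_page_walk_one() is effectively a small
reconciliation engine between the guest page table and the per-device
iova_tree. A reader's summary of its decision table, distilled from the hunks
above (no behavior beyond the patch is implied):

    /*
     * guest PTE    iova_tree state    action taken
     * ---------    ---------------    -----------------------------------
     * valid        not mapped         notify MAP, insert into tree
     * valid        identical map      skip; already in sync
     * valid        different map      notify UNMAP then MAP, update tree
     *                                 (buggy guest; see TODO in the code)
     * invalid      mapped             notify UNMAP, remove from tree
     * invalid      not mapped         skip; nothing was ever announced
     */

This bookkeeping is what lets domain and context invalidations resync the
shadow page table without first unmapping the whole region.]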