From dda53ee93af90547912f0ea99ca85deba49fd364 Mon Sep 17 00:00:00 2001
From: Zihan Yang
Date: Wed, 25 Apr 2018 17:52:23 +0800
Subject: hw/pci-host/q35: Replace hardcoded value with macro

During SMRAM region initialization some addresses are hardcoded;
replace them with macros to be clearer to readers. The previous patch
missed one value and exceeded the line limit of 90 characters; v2
breaks up a few long lines.

Signed-off-by: Zihan Yang
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/pci-host/q35.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'hw')

diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index a36a119..02f9576 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -535,13 +535,15 @@ static void mch_realize(PCIDevice *d, Error **errp)
 
     /* if *disabled* show SMRAM to all CPUs */
     memory_region_init_alias(&mch->smram_region, OBJECT(mch), "smram-region",
-                             mch->pci_address_space, 0xa0000, 0x20000);
-    memory_region_add_subregion_overlap(mch->system_memory, 0xa0000,
+                             mch->pci_address_space, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+                             MCH_HOST_BRIDGE_SMRAM_C_SIZE);
+    memory_region_add_subregion_overlap(mch->system_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
                                         &mch->smram_region, 1);
     memory_region_set_enabled(&mch->smram_region, true);
 
     memory_region_init_alias(&mch->open_high_smram, OBJECT(mch), "smram-open-high",
-                             mch->ram_memory, 0xa0000, 0x20000);
+                             mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+                             MCH_HOST_BRIDGE_SMRAM_C_SIZE);
     memory_region_add_subregion_overlap(mch->system_memory, 0xfeda0000,
                                         &mch->open_high_smram, 1);
     memory_region_set_enabled(&mch->open_high_smram, false);
@@ -550,11 +552,14 @@ static void mch_realize(PCIDevice *d, Error **errp)
     memory_region_init(&mch->smram, OBJECT(mch), "smram", 1ull << 32);
     memory_region_set_enabled(&mch->smram, true);
     memory_region_init_alias(&mch->low_smram, OBJECT(mch), "smram-low",
-                             mch->ram_memory, 0xa0000, 0x20000);
+                             mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+                             MCH_HOST_BRIDGE_SMRAM_C_SIZE);
     memory_region_set_enabled(&mch->low_smram, true);
-    memory_region_add_subregion(&mch->smram, 0xa0000, &mch->low_smram);
+    memory_region_add_subregion(&mch->smram, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+                                &mch->low_smram);
     memory_region_init_alias(&mch->high_smram, OBJECT(mch), "smram-high",
-                             mch->ram_memory, 0xa0000, 0x20000);
+                             mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+                             MCH_HOST_BRIDGE_SMRAM_C_SIZE);
     memory_region_set_enabled(&mch->high_smram, true);
     memory_region_add_subregion(&mch->smram, 0xfeda0000, &mch->high_smram);
-- cgit v1.1

From b7b126442977dcfa5641e38b084bd13a0b366cff Mon Sep 17 00:00:00 2001
From: Jonathan Helman
Date: Mon, 19 Mar 2018 15:28:49 -0700
Subject: virtio-balloon: add hugetlb page allocation counts

qemu should read and report hugetlb page allocation counts exported
in the following kernel patch:

    commit 4c3ca37c4a4394978fd0f005625f6064ed2b9a64
    Author: Jonathan Helman
    Date:   Mon Mar 19 11:00:35 2018 -0700

        virtio_balloon: export hugetlb page allocation counts

        Export the number of successful and failed hugetlb page
        allocations via the virtio balloon driver. These 2 counts come
        directly from the vm_events HTLB_BUDDY_PGALLOC and
        HTLB_BUDDY_PGALLOC_FAIL.

Signed-off-by: Jonathan Helman
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Jason Wang
---
 hw/virtio/virtio-balloon.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'hw')

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index f456cea..1f7a87f 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -52,6 +52,8 @@ static const char *balloon_stat_names[] = {
     [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
     [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
     [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
+    [VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc",
+    [VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail",
     [VIRTIO_BALLOON_S_NR] = NULL
 };
-- cgit v1.1
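[Editor's note: each balloon statistic arrives from the guest as a (tag, value)
pair, and the tag indexes balloon_stat_names[] above to form the property name
exposed under the balloon device's guest-stats. A minimal sketch of that lookup,
assuming only the array and the VIRTIO_BALLOON_S_* constants from this patch;
the helper itself is hypothetical, not QEMU code:

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical helper: translate a guest-reported stat tag into its
     * property name, e.g. VIRTIO_BALLOON_S_HTLB_PGALLOC maps to
     * "stat-htlb-pgalloc"; unknown or unset tags yield NULL. */
    static const char *balloon_stat_name(uint16_t tag)
    {
        if (tag < VIRTIO_BALLOON_S_NR && balloon_stat_names[tag]) {
            return balloon_stat_names[tag];
        }
        return NULL;
    }

With a guest kernel carrying the quoted driver patch, the two new hugetlb
counters then show up alongside the existing ones when guest statistics are
polled.]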
From ffcbbe722fcb3c162318cba1b94a115498e25acd Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 27 Apr 2018 17:07:24 +0800
Subject: vhost: add trace for IOTLB miss

Add some trace points for IOTLB translation for vhost. After
vhost-user is set up, the only I/O path QEMU still participates in
should be the IOMMU translation, so it is good to be able to track it
with explicit timestamps when needed, to see how long a translation
takes and whether anything is stuck inside. It might be useful for
triaging vhost-user problems.

Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/virtio/trace-events | 1 +
 hw/virtio/vhost.c      | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'hw')

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 1422ff0..07bcbe9 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -6,6 +6,7 @@ vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t
 vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64
 vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64
 vhost_section(const char *name, int r) "%s:%d"
+vhost_iotlb_miss(void *dev, int step) "%p step %d"
 
 # hw/virtio/vhost-user.c
 vhost_user_postcopy_end_entry(void) ""

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 9d5850a..b082900 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -894,12 +894,15 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
 
     rcu_read_lock();
 
+    trace_vhost_iotlb_miss(dev, 1);
+
     iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                           iova, write);
     if (iotlb.target_as != NULL) {
         ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                          &uaddr, &len);
         if (ret) {
+            trace_vhost_iotlb_miss(dev, 3);
             error_report("Fail to lookup the translated address "
                          "%"PRIx64, iotlb.translated_addr);
             goto out;
@@ -911,10 +914,14 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
         ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
                                                 len, iotlb.perm);
         if (ret) {
+            trace_vhost_iotlb_miss(dev, 4);
             error_report("Fail to update device iotlb");
             goto out;
         }
     }
+
+    trace_vhost_iotlb_miss(dev, 2);
+
 out:
     rcu_read_unlock();
-- cgit v1.1
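[Editor's note: the "step" argument above is a bare progress marker; the source
defines no named constants for it. A reader-side legend inferred from the call
sites in this patch (the enumerator names are mine, not QEMU's):

    /* Meaning of the integer "step" passed to trace_vhost_iotlb_miss(). */
    enum vhost_iotlb_miss_step {
        VHOST_IOTLB_MISS_START       = 1, /* entered the miss handler */
        VHOST_IOTLB_MISS_DONE        = 2, /* reached the end without error */
        VHOST_IOTLB_MISS_LOOKUP_FAIL = 3, /* no uaddr for translated_addr */
        VHOST_IOTLB_MISS_UPDATE_FAIL = 4, /* backend IOTLB update failed */
    };

The timestamp delta between step 1 and step 2 for a given device is the
translation latency the commit message refers to.]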
From 1814eab67398029a927847dfd29198a26aeaf7d8 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin"
Date: Tue, 17 Apr 2018 21:47:50 +0300
Subject: x86/cpu: use standard-headers/asm-x86.kvm_para.h

Switch to the header we imported from Linux; this allows us to drop a
hack in kvm_i386.h. More code will be dropped in the next patch.

Signed-off-by: Michael S. Tsirkin
---
 hw/i386/kvm/clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'hw')

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 7dac319..0bf1c60 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -26,7 +26,7 @@
 #include "qapi/error.h"
 
 #include <linux/kvm.h>
-#include <asm/kvm_para.h>
+#include "standard-headers/asm-x86/kvm_para.h"
 
 #define TYPE_KVM_CLOCK "kvmclock"
 #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
-- cgit v1.1

From 1f3a4519b1c107b5db2434b30638353978366b4d Mon Sep 17 00:00:00 2001
From: Tiwei Bie
Date: Thu, 12 Apr 2018 23:12:29 +0800
Subject: vhost-user: support receiving file descriptors in slave_read

Signed-off-by: Tiwei Bie
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/virtio/vhost-user.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

(limited to 'hw')

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 38da869..85d8fd2 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -852,14 +852,44 @@ static void slave_read(void *opaque)
     VhostUserHeader hdr = { 0, };
     VhostUserPayload payload = { 0, };
     int size, ret = 0;
+    struct iovec iov;
+    struct msghdr msgh;
+    int fd = -1;
+    char control[CMSG_SPACE(sizeof(fd))];
+    struct cmsghdr *cmsg;
+    size_t fdsize;
+
+    memset(&msgh, 0, sizeof(msgh));
+    msgh.msg_iov = &iov;
+    msgh.msg_iovlen = 1;
+    msgh.msg_control = control;
+    msgh.msg_controllen = sizeof(control);
 
     /* Read header */
-    size = read(u->slave_fd, &hdr, VHOST_USER_HDR_SIZE);
+    iov.iov_base = &hdr;
+    iov.iov_len = VHOST_USER_HDR_SIZE;
+
+    size = recvmsg(u->slave_fd, &msgh, 0);
     if (size != VHOST_USER_HDR_SIZE) {
         error_report("Failed to read from slave.");
         goto err;
     }
 
+    if (msgh.msg_flags & MSG_CTRUNC) {
+        error_report("Truncated message.");
+        goto err;
+    }
+
+    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+         cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+        if (cmsg->cmsg_level == SOL_SOCKET &&
+            cmsg->cmsg_type == SCM_RIGHTS) {
+            fdsize = cmsg->cmsg_len - CMSG_LEN(0);
+            memcpy(&fd, CMSG_DATA(cmsg), fdsize);
+            break;
+        }
+    }
+
     if (hdr.size > VHOST_USER_PAYLOAD_SIZE) {
         error_report("Failed to read msg header."
                      " Size %d exceeds the maximum %zu.", hdr.size,
@@ -883,9 +913,15 @@ static void slave_read(void *opaque)
         break;
     default:
         error_report("Received unexpected msg type.");
+        if (fd != -1) {
+            close(fd);
+        }
         ret = -EINVAL;
     }
 
+    /* Message handlers need to make sure that fd will be consumed. */
+    fd = -1;
+
     /*
      * REPLY_ACK feature handling. Other reply types has to be managed
      * directly in their request handlers.
@@ -918,6 +954,9 @@ err:
     qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
     close(u->slave_fd);
     u->slave_fd = -1;
+    if (fd != -1) {
+        close(fd);
+    }
     return;
 }
-- cgit v1.1
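[Editor's note: the cmsg loop added to slave_read() is the standard POSIX
recipe for receiving a file descriptor over a UNIX domain socket. A
self-contained sketch of the same technique, stripped of the vhost-user
specifics (the helper name and the simplified error handling are mine):

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Receive one byte of payload plus an optional fd attached via
     * SCM_RIGHTS; returns the fd, or -1 if none was attached. Real code,
     * like slave_read() above, must also check MSG_CTRUNC. */
    static int recv_one_fd(int sock)
    {
        char data;
        struct iovec iov = { .iov_base = &data, .iov_len = sizeof(data) };
        int fd = -1;
        char control[CMSG_SPACE(sizeof(fd))];
        struct msghdr msgh = {
            .msg_iov = &iov, .msg_iovlen = 1,
            .msg_control = control, .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmsg;

        if (recvmsg(sock, &msgh, 0) < 0) {
            return -1;
        }
        for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
             cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
            if (cmsg->cmsg_level == SOL_SOCKET &&
                cmsg->cmsg_type == SCM_RIGHTS) {
                memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
                break;
            }
        }
        return fd;
    }

Note the control buffer is sized with CMSG_SPACE() and the descriptor is
copied out with memcpy() rather than a pointer cast, since CMSG_DATA() is not
guaranteed to be suitably aligned for direct dereference.]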
From 6f80e6170ede13605817e5c0ca73db0de7bdf261 Mon Sep 17 00:00:00 2001
From: Tiwei Bie
Date: Thu, 12 Apr 2018 23:12:30 +0800
Subject: virtio: support setting memory region based host notifier

This patch introduces support for setting memory-region-based host
notifiers for a virtio device. This is helpful when using a hardware
accelerator for a virtio device, because hardware depends heavily on
the notification; this allows the guest driver in the VM to notify
the hardware directly.

Signed-off-by: Tiwei Bie
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/virtio/virtio-pci.c | 22 ++++++++++++++++++++++
 hw/virtio/virtio.c     | 13 +++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'hw')

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 1e8ab7b..5eb0c32 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1037,6 +1037,27 @@ assign_error:
     return r;
 }
 
+static int virtio_pci_set_host_notifier_mr(DeviceState *d, int n,
+                                           MemoryRegion *mr, bool assign)
+{
+    VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+    int offset;
+
+    if (n >= VIRTIO_QUEUE_MAX || !virtio_pci_modern(proxy) ||
+        virtio_pci_queue_mem_mult(proxy) != memory_region_size(mr)) {
+        return -1;
+    }
+
+    if (assign) {
+        offset = virtio_pci_queue_mem_mult(proxy) * n;
+        memory_region_add_subregion_overlap(&proxy->notify.mr, offset, mr, 1);
+    } else {
+        memory_region_del_subregion(&proxy->notify.mr, mr);
+    }
+
+    return 0;
+}
+
 static void virtio_pci_vmstate_change(DeviceState *d, bool running)
 {
     VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -2652,6 +2673,7 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data)
     k->has_extra_state = virtio_pci_has_extra_state;
     k->query_guest_notifiers = virtio_pci_query_guest_notifiers;
     k->set_guest_notifiers = virtio_pci_set_guest_notifiers;
+    k->set_host_notifier_mr = virtio_pci_set_host_notifier_mr;
     k->vmstate_change = virtio_pci_vmstate_change;
     k->pre_plugged = virtio_pci_pre_plugged;
     k->device_plugged = virtio_pci_device_plugged;

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 006d3d1..1debb01 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2454,6 +2454,19 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq)
     return &vq->host_notifier;
 }
 
+int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n,
+                                      MemoryRegion *mr, bool assign)
+{
+    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+    if (k->set_host_notifier_mr) {
+        return k->set_host_notifier_mr(qbus->parent, n, mr, assign);
+    }
+
+    return -1;
+}
+
 void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
 {
     g_free(vdev->bus_name);
-- cgit v1.1

From 9952e807fd016c95f50372536f1ac65a601be6e4 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert"
Date: Wed, 2 May 2018 11:55:52 +0100
Subject: vhost-user+postcopy: Use qemu_set_nonblock

Use qemu_set_nonblock rather than a simple fcntl; cleaner, and I have
no reason to change other flags.

Reported-by: Peter Maydell
Signed-off-by: Dr. David Alan Gilbert
Reviewed-by: Eric Blake
Reviewed-by: Stefan Hajnoczi
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S.
Tsirkin --- hw/virtio/vhost-user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 85d8fd2..41cbd8a 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1115,7 +1115,7 @@ static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp) error_setg(errp, "%s: Failed to get ufd", __func__); return -1; } - fcntl(ufd, F_SETFL, O_NONBLOCK); + qemu_set_nonblock(ufd); /* register ufd with userfault thread */ u->postcopy_fd.fd = ufd; -- cgit v1.1 From ebf2a499a5c43d5eaee2b70ab5ae655af49d935c Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 11 May 2018 21:59:40 -0700 Subject: hw/virtio: Fix brace Werror with clang 6.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The warning is hw/virtio/vhost-user.c:1319:26: error: suggest braces around initialization of subobject [-Werror,-Wmissing-braces] VhostUserMsg msg = { 0 }; ^ {} While the original code is correct, and technically exactly correct as per ISO C89, both GCC and Clang support plain empty set of braces as an extension. Cc: Michael S. Tsirkin Signed-off-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Blake Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 41cbd8a..ca554d4 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1355,7 +1355,7 @@ static bool vhost_user_requires_shm_log(struct vhost_dev *dev) static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) { - VhostUserMsg msg = { 0 }; + VhostUserMsg msg = { }; assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); -- cgit v1.1 From 1a97a478e69534f70ebbed1f27c315dbdc23f94f Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 21 May 2018 10:32:00 -0600 Subject: nvdimm: fix typo in label-size definition Signed-off-by: Ross Zwisler Fixes: commit da6789c27c2e ("nvdimm: add a macro for property "label-size"") Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov Cc: Haozhong Zhang Cc: Michael S. Tsirkin Cc: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/mem/nvdimm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index acb656b..4087aca 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -89,7 +89,7 @@ static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp) static void nvdimm_init(Object *obj) { - object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int", + object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); object_property_add_bool(obj, NVDIMM_UNARMED_PROP, -- cgit v1.1 From 36d2d52bdb45f5b753a61fdaf0fe7891f1f5b61d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:09 +0800 Subject: intel-iommu: send PSI always even if across PDEs SECURITY IMPLICATION: without this patch, any guest with both assigned device and a vIOMMU might encounter stale IO page mappings even if guest has already unmapped the page, which may lead to guest memory corruption. The stale mappings will only be limited to the guest's own memory range, so it should not affect the host memory or other guests on the host. 
During IOVA page table walking, there is a special case when the PSI
covers one whole PDE (Page Directory Entry, which contains 512 Page
Table Entries) or more. In the past we skipped such entries and did
not notify the IOMMU notifiers. This is not correct: we should send
an UNMAP notification to registered UNMAP notifiers in this case.

For UNMAP-only notifiers, the old behavior might leave IOTLB entries
cached in the devices even though they are already invalid. For
MAP/UNMAP notifiers like vfio-pci, it will cause stale page mappings.

This special case doesn't trigger often, but it is very easily
triggered by nested device assignment, since in that case we'll
possibly map the whole L2 guest RAM region into the device's IOVA
address space (several GBs at least), which is far bigger than normal
kernel driver usages of the device (tens of MBs normally).

Without this patch applied to L1 QEMU, nested device assignment to L2
guests will dump some errors like:

qemu-system-x86_64: VFIO_MAP_DMA: -17
qemu-system-x86_64: vfio_dma_map(0x557305420c30, 0xad000, 0x1000,
                    0x7f89a920d000) = -17 (File exists)

CC: QEMU Stable
Acked-by: Jason Wang
[peterx: rewrite the commit message]
Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index fb31de9..b359efd 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -722,6 +722,15 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
 
 typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
 
+static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
+                             vtd_page_walk_hook hook_fn, void *private)
+{
+    assert(hook_fn);
+    trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr,
+                            entry->addr_mask, entry->perm);
+    return hook_fn(entry, private);
+}
+
 /**
  * vtd_page_walk_level - walk over specific level for IOVA range
  *
@@ -781,28 +790,37 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
          */
         entry_valid = read_cur | write_cur;
 
+        entry.target_as = &address_space_memory;
+        entry.iova = iova & subpage_mask;
+        entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
+        entry.addr_mask = ~subpage_mask;
+
         if (vtd_is_last_slpte(slpte, level)) {
-            entry.target_as = &address_space_memory;
-            entry.iova = iova & subpage_mask;
             /* NOTE: this is only meaningful if entry_valid == true */
             entry.translated_addr = vtd_get_slpte_addr(slpte, aw);
-            entry.addr_mask = ~subpage_mask;
-            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
             if (!entry_valid && !notify_unmap) {
                 trace_vtd_page_walk_skip_perm(iova, iova_next);
                 goto next;
             }
-            trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
-                                    entry.addr_mask, entry.perm);
-            if (hook_fn) {
-                ret = hook_fn(&entry, private);
-                if (ret < 0) {
-                    return ret;
-                }
+            ret = vtd_page_walk_one(&entry, level, hook_fn, private);
+            if (ret < 0) {
+                return ret;
             }
         } else {
             if (!entry_valid) {
-                trace_vtd_page_walk_skip_perm(iova, iova_next);
+                if (notify_unmap) {
+                    /*
+                     * The whole entry is invalid; unmap it all.
+                     * Translated address is meaningless, zero it.
+                     */
+                    entry.translated_addr = 0x0;
+                    ret = vtd_page_walk_one(&entry, level, hook_fn, private);
+                    if (ret < 0) {
+                        return ret;
+                    }
+                } else {
+                    trace_vtd_page_walk_skip_perm(iova, iova_next);
+                }
                 goto next;
             }
             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova,
-- cgit v1.1

From b4a4ba0d68f50f218ee3957b6638dbee32a5eeef Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 18 May 2018 15:25:10 +0800
Subject: intel-iommu: remove IntelIOMMUNotifierNode

That is not really necessary: remove the node struct and put the list
entry directly into VTDAddressSpace. It simplifies the code a lot.
While at it, rename the old notifiers_list into vtd_as_with_notifiers.

CC: QEMU Stable
Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 41 +++++++++++------------------------------
 1 file changed, 11 insertions(+), 30 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index b359efd..3df9045 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1248,10 +1248,10 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
 
 static void vtd_iommu_replay_all(IntelIOMMUState *s)
 {
-    IntelIOMMUNotifierNode *node;
+    VTDAddressSpace *vtd_as;
 
-    QLIST_FOREACH(node, &s->notifiers_list, next) {
-        memory_region_iommu_replay_all(&node->vtd_as->iommu);
+    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
+        memory_region_iommu_replay_all(&vtd_as->iommu);
     }
 }
 
@@ -1372,7 +1372,6 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s,
                                         uint16_t domain_id)
 {
-    IntelIOMMUNotifierNode *node;
     VTDContextEntry ce;
     VTDAddressSpace *vtd_as;
 
@@ -1381,8 +1380,7 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
                                 &domain_id);
 
-    QLIST_FOREACH(node, &s->notifiers_list, next) {
-        vtd_as = node->vtd_as;
+    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                       vtd_as->devfn, &ce) &&
             domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
@@ -1402,12 +1400,11 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
                                            uint16_t domain_id, hwaddr addr,
                                            uint8_t am)
 {
-    IntelIOMMUNotifierNode *node;
+    VTDAddressSpace *vtd_as;
     VTDContextEntry ce;
     int ret;
 
-    QLIST_FOREACH(node, &(s->notifiers_list), next) {
-        VTDAddressSpace *vtd_as = node->vtd_as;
+    QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
         ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                        vtd_as->devfn, &ce);
         if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
@@ -2344,8 +2341,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
 {
     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
     IntelIOMMUState *s = vtd_as->iommu_state;
-    IntelIOMMUNotifierNode *node = NULL;
-    IntelIOMMUNotifierNode *next_node = NULL;
 
     if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) {
         error_report("We need to set caching-mode=1 for intel-iommu to enable "
@@ -2354,21 +2349,9 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
     }
 
     if (old == IOMMU_NOTIFIER_NONE) {
-        node = g_malloc0(sizeof(*node));
-        node->vtd_as = vtd_as;
-        QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
-        return;
-    }
-
-    /* update notifier node with new flags */
-    QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) {
-        if (node->vtd_as == vtd_as) {
-            if (new == IOMMU_NOTIFIER_NONE) {
-                QLIST_REMOVE(node,
next); - g_free(node); - } - return; - } + QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(vtd_as, next); } } @@ -2838,12 +2821,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) static void vtd_address_space_unmap_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; VTDAddressSpace *vtd_as; IOMMUNotifier *n; - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { vtd_address_space_unmap(vtd_as, n); } @@ -3088,7 +3069,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) return; } - QLIST_INIT(&s->notifiers_list); + QLIST_INIT(&s->vtd_as_with_notifiers); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); -- cgit v1.1 From 1d9efa73e12ddf361ea997c2d532cc4afa6674d1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:11 +0800 Subject: intel-iommu: add iommu lock SECURITY IMPLICATION: this patch fixes a potential race when multiple threads access the IOMMU IOTLB cache. Add a per-iommu big lock to protect IOMMU status. Currently the only thing to be protected is the IOTLB/context cache, since that can be accessed even without BQL, e.g., in IO dataplane. Note that we don't need to protect device page tables since that's fully controlled by the guest kernel. However there is still possibility that malicious drivers will program the device to not obey the rule. In that case QEMU can't really do anything useful, instead the guest itself will be responsible for all uncertainties. CC: QEMU Stable Reported-by: Fam Zheng Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 56 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 9 deletions(-) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3df9045..8d4069d 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -128,6 +128,16 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, return new_val; } +static inline void vtd_iommu_lock(IntelIOMMUState *s) +{ + qemu_mutex_lock(&s->iommu_lock); +} + +static inline void vtd_iommu_unlock(IntelIOMMUState *s) +{ + qemu_mutex_unlock(&s->iommu_lock); +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -172,9 +182,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, } /* Reset all the gen of VTDAddressSpace to zero and set the gen of - * IntelIOMMUState to 1. + * IntelIOMMUState to 1. Must be called with IOMMU lock held. */ -static void vtd_reset_context_cache(IntelIOMMUState *s) +static void vtd_reset_context_cache_locked(IntelIOMMUState *s) { VTDAddressSpace *vtd_as; VTDBus *vtd_bus; @@ -197,12 +207,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s) s->context_cache_gen = 1; } -static void vtd_reset_iotlb(IntelIOMMUState *s) +/* Must be called with IOMMU lock held. 
*/ +static void vtd_reset_iotlb_locked(IntelIOMMUState *s) { assert(s->iotlb); g_hash_table_remove_all(s->iotlb); } +static void vtd_reset_iotlb(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); +} + static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, uint32_t level) { @@ -215,6 +233,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; } +/* Must be called with IOMMU lock held */ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, hwaddr addr) { @@ -235,6 +254,7 @@ out: return entry; } +/* Must be with IOMMU lock held */ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint16_t domain_id, hwaddr addr, uint64_t slpte, uint8_t access_flags, uint32_t level) @@ -246,7 +266,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { trace_vtd_iotlb_reset("iotlb exceeds size limit"); - vtd_reset_iotlb(s); + vtd_reset_iotlb_locked(s); } entry->gfn = gfn; @@ -1106,7 +1126,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, IntelIOMMUState *s = vtd_as->iommu_state; VTDContextEntry ce; uint8_t bus_num = pci_bus_num(bus); - VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry; + VTDContextCacheEntry *cc_entry; uint64_t slpte, page_mask; uint32_t level; uint16_t source_id = vtd_make_source_id(bus_num, devfn); @@ -1123,6 +1143,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, */ assert(!vtd_is_interrupt_addr(addr)); + vtd_iommu_lock(s); + + cc_entry = &vtd_as->context_cache_entry; + /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { @@ -1182,7 +1206,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, * IOMMU region can be swapped back. */ vtd_pt_enable_fast_path(s, source_id); - + vtd_iommu_unlock(s); return true; } @@ -1203,6 +1227,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte, access_flags, level); out: + vtd_iommu_unlock(s); entry->iova = addr & page_mask; entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; entry->addr_mask = ~page_mask; @@ -1210,6 +1235,7 @@ out: return true; error: + vtd_iommu_unlock(s); entry->iova = 0; entry->translated_addr = 0; entry->addr_mask = 0; @@ -1258,10 +1284,13 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) static void vtd_context_global_invalidate(IntelIOMMUState *s) { trace_vtd_inv_desc_cc_global(); + /* Protects context cache */ + vtd_iommu_lock(s); s->context_cache_gen++; if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { - vtd_reset_context_cache(s); + vtd_reset_context_cache_locked(s); } + vtd_iommu_unlock(s); vtd_switch_address_space_all(s); /* * From VT-d spec 6.5.2.1, a global context entry invalidation @@ -1313,7 +1342,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), VTD_PCI_FUNC(devfn_it)); + vtd_iommu_lock(s); vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); /* * Do switch address space when needed, in case if the * device passthrough bit is switched. 
@@ -1377,8 +1408,10 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
 
     trace_vtd_inv_desc_iotlb_domain(domain_id);
 
+    vtd_iommu_lock(s);
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
                                 &domain_id);
+    vtd_iommu_unlock(s);
 
     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
@@ -1426,7 +1459,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
     info.domain_id = domain_id;
     info.addr = addr;
     info.mask = ~((1 << am) - 1);
+    vtd_iommu_lock(s);
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+    vtd_iommu_unlock(s);
     vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
 }
 
@@ -2929,8 +2964,10 @@ static void vtd_init(IntelIOMMUState *s)
         s->cap |= VTD_CAP_CM;
     }
 
-    vtd_reset_context_cache(s);
-    vtd_reset_iotlb(s);
+    vtd_iommu_lock(s);
+    vtd_reset_context_cache_locked(s);
+    vtd_reset_iotlb_locked(s);
+    vtd_iommu_unlock(s);
 
     /* Define registers with default values and bit semantics */
     vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
@@ -3070,6 +3107,7 @@ static void vtd_realize(DeviceState *dev, Error **errp)
     }
 
     QLIST_INIT(&s->vtd_as_with_notifiers);
+    qemu_mutex_init(&s->iommu_lock);
     memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
     memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
                           "intel_iommu", DMAR_REG_SIZE);
-- cgit v1.1

From 4f8a62a933a79094e44bc1b16b63bb23e62d67b4 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 18 May 2018 15:25:12 +0800
Subject: intel-iommu: only do page walk for MAP notifiers

For UNMAP-only IOMMU notifiers, we don't need to walk the page tables.
Speed that procedure up by skipping the page table walk. That should
boost performance for UNMAP-only notifiers like vhost.

CC: QEMU Stable
Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 44 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 8d4069d..38ccc74 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -138,6 +138,12 @@ static inline void vtd_iommu_unlock(IntelIOMMUState *s)
     qemu_mutex_unlock(&s->iommu_lock);
 }
 
+/* Whether the address space needs to notify new mappings */
+static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
+{
+    return as->notifier_flags & IOMMU_NOTIFIER_MAP;
+}
+
 /* GHashTable functions */
 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
 {
@@ -1436,14 +1442,36 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
     VTDAddressSpace *vtd_as;
     VTDContextEntry ce;
     int ret;
+    hwaddr size = (1 << am) * VTD_PAGE_SIZE;
 
     QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
         ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                        vtd_as->devfn, &ce);
         if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
-            vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
-                          vtd_page_invalidate_notify_hook,
-                          (void *)&vtd_as->iommu, true, s->aw_bits);
+            if (vtd_as_has_map_notifier(vtd_as)) {
+                /*
+                 * As long as we have MAP notifications registered in
+                 * any of our IOMMU notifiers, we need to sync the
+                 * shadow page table.
+                 */
+                vtd_page_walk(&ce, addr, addr + size,
+                              vtd_page_invalidate_notify_hook,
+                              (void *)&vtd_as->iommu, true, s->aw_bits);
+            } else {
+                /*
+                 * For UNMAP-only notifiers, we don't need to walk the
+                 * page tables.  We just deliver the PSI down to
+                 * invalidate caches.
+                 */
+                IOMMUTLBEntry entry = {
+                    .target_as = &address_space_memory,
+                    .iova = addr,
+                    .translated_addr = 0,
+                    .addr_mask = size - 1,
+                    .perm = IOMMU_NONE,
+                };
+                memory_region_notify_iommu(&vtd_as->iommu, entry);
+            }
         }
     }
 }
@@ -2383,6 +2411,9 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
         exit(1);
     }
 
+    /* Update per-address-space notifier flags */
+    vtd_as->notifier_flags = new;
+
     if (old == IOMMU_NOTIFIER_NONE) {
         QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
     } else if (new == IOMMU_NOTIFIER_NONE) {
@@ -2891,8 +2922,11 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
                                   PCI_FUNC(vtd_as->devfn),
                                   VTD_CONTEXT_ENTRY_DID(ce.hi),
                                   ce.hi, ce.lo);
-        vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false,
-                      s->aw_bits);
+        if (vtd_as_has_map_notifier(vtd_as)) {
+            /* This is required only for MAP typed notifiers */
+            vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false,
+                          s->aw_bits);
+        }
     } else {
         trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
                                     PCI_FUNC(vtd_as->devfn));
-- cgit v1.1
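[Editor's note: the consumer this fast path targets is vhost, which registers
an UNMAP-only notifier, so vtd_as_has_map_notifier() returns false for its
address space. A simplified sketch of such a registration, loosely modeled on
hw/virtio/vhost.c of this period — the argument details are approximate from
memory, not a verbatim quote:

    /* Register interest in UNMAP events only; IOMMU_NOTIFIER_MAP is
     * deliberately not set, so intel-iommu skips the shadow page walk
     * for this address space and just forwards invalidation ranges. */
    iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
                        IOMMU_NOTIFIER_UNMAP,
                        section->offset_within_region,
                        int128_get64(end));
    memory_region_register_iommu_notifier(section->mr, &iommu->n);

A vfio-pci device, by contrast, registers both MAP and UNMAP and therefore
still takes the vtd_page_walk() path above.]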
From fe215b0cbb8c1f4b4af0a64aa5c02042080dd537 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 18 May 2018 15:25:13 +0800
Subject: intel-iommu: introduce vtd_page_walk_info

During the recursive page walking of IOVA page tables, some stack
variables are constants and never change during the whole page
walking procedure. Isolate them into a struct so that we don't need
to pass those constants down the stack again at every level.

CC: QEMU Stable
Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 84 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 32 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 38ccc74..e247269 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -748,9 +748,27 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
 
 typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
 
+/**
+ * Constant information used during page walking
+ *
+ * @hook_fn: hook func to be called when detected page
+ * @private: private data to be passed into hook func
+ * @notify_unmap: whether we should notify invalid entries
+ * @aw: maximum address width
+ */
+typedef struct {
+    vtd_page_walk_hook hook_fn;
+    void *private;
+    bool notify_unmap;
+    uint8_t aw;
+} vtd_page_walk_info;
+
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
-                             vtd_page_walk_hook hook_fn, void *private)
+                             vtd_page_walk_info *info)
 {
+    vtd_page_walk_hook hook_fn = info->hook_fn;
+    void *private = info->private;
+
     assert(hook_fn);
     trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr,
                             entry->addr_mask, entry->perm);
@@ -763,17 +781,13 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
  * @addr: base GPA addr to start the walk
  * @start: IOVA range start address
  * @end: IOVA range end address (start <= addr < end)
- * @hook_fn: hook func to be called when detected page
- * @private: private data to be passed into hook func
 * @read: whether parent level has read permission
 * @write: whether parent level has write permission
- * @notify_unmap: whether we should notify invalid entries
- * @aw: maximum address width
+ * @info: constant information for the page walk
 */
 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
-                               uint64_t end,
vtd_page_walk_hook hook_fn, - void *private, uint32_t level, bool read, - bool write, bool notify_unmap, uint8_t aw) + uint64_t end, uint32_t level, bool read, + bool write, vtd_page_walk_info *info) { bool read_cur, write_cur, entry_valid; uint32_t offset; @@ -823,24 +837,24 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, if (vtd_is_last_slpte(slpte, level)) { /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte, aw); - if (!entry_valid && !notify_unmap) { + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + if (!entry_valid && !info->notify_unmap) { trace_vtd_page_walk_skip_perm(iova, iova_next); goto next; } - ret = vtd_page_walk_one(&entry, level, hook_fn, private); + ret = vtd_page_walk_one(&entry, level, info); if (ret < 0) { return ret; } } else { if (!entry_valid) { - if (notify_unmap) { + if (info->notify_unmap) { /* * The whole entry is invalid; unmap it all. * Translated address is meaningless, zero it. */ entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, level, hook_fn, private); + ret = vtd_page_walk_one(&entry, level, info); if (ret < 0) { return ret; } @@ -849,10 +863,9 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, } goto next; } - ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova, - MIN(iova_next, end), hook_fn, private, - level - 1, read_cur, write_cur, - notify_unmap, aw); + ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), + iova, MIN(iova_next, end), level - 1, + read_cur, write_cur, info); if (ret < 0) { return ret; } @@ -871,28 +884,24 @@ next: * @ce: context entry to walk upon * @start: IOVA address to start the walk * @end: IOVA range end address (start <= addr < end) - * @hook_fn: the hook that to be called for each detected area - * @private: private data for the hook function - * @aw: maximum address width + * @info: page walking information struct */ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, - vtd_page_walk_hook hook_fn, void *private, - bool notify_unmap, uint8_t aw) + vtd_page_walk_info *info) { dma_addr_t addr = vtd_ce_get_slpt_base(ce); uint32_t level = vtd_ce_get_level(ce); - if (!vtd_iova_range_check(start, ce, aw)) { + if (!vtd_iova_range_check(start, ce, info->aw)) { return -VTD_FR_ADDR_BEYOND_MGAW; } - if (!vtd_iova_range_check(end, ce, aw)) { + if (!vtd_iova_range_check(end, ce, info->aw)) { /* Fix end so that it reaches the maximum */ - end = vtd_iova_limit(ce, aw); + end = vtd_iova_limit(ce, info->aw); } - return vtd_page_walk_level(addr, start, end, hook_fn, private, - level, true, true, notify_unmap, aw); + return vtd_page_walk_level(addr, start, end, level, true, true, info); } /* Map a device to its corresponding domain (context-entry) */ @@ -1449,14 +1458,19 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { if (vtd_as_has_map_notifier(vtd_as)) { + vtd_page_walk_info info = { + .hook_fn = vtd_page_invalidate_notify_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + }; + /* * As long as we have MAP notifications registered in * any of our IOMMU notifiers, we need to sync the * shadow page table. 
*/ - vtd_page_walk(&ce, addr, addr + size, - vtd_page_invalidate_notify_hook, - (void *)&vtd_as->iommu, true, s->aw_bits); + vtd_page_walk(&ce, addr, addr + size, &info); } else { /* * For UNMAP-only notifiers, we don't need to walk the @@ -2924,8 +2938,14 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) ce.hi, ce.lo); if (vtd_as_has_map_notifier(vtd_as)) { /* This is required only for MAP typed notifiers */ - vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, - s->aw_bits); + vtd_page_walk_info info = { + .hook_fn = vtd_replay_hook, + .private = (void *)n, + .notify_unmap = false, + .aw = s->aw_bits, + }; + + vtd_page_walk(&ce, 0, ~0ULL, &info); } } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), -- cgit v1.1 From 2f764fa87d2a81812b313dd6d998e10126292653 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:14 +0800 Subject: intel-iommu: pass in address space when page walk We pass in the VTDAddressSpace too. It'll be used in the follow up patches. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index e247269..a882894 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -754,9 +754,11 @@ typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); * @hook_fn: hook func to be called when detected page * @private: private data to be passed into hook func * @notify_unmap: whether we should notify invalid entries + * @as: VT-d address space of the device * @aw: maximum address width */ typedef struct { + VTDAddressSpace *as; vtd_page_walk_hook hook_fn; void *private; bool notify_unmap; @@ -1463,6 +1465,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, .private = (void *)&vtd_as->iommu, .notify_unmap = true, .aw = s->aw_bits, + .as = vtd_as, }; /* @@ -2943,6 +2946,7 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) .private = (void *)n, .notify_unmap = false, .aw = s->aw_bits, + .as = vtd_as, }; vtd_page_walk(&ce, 0, ~0ULL, &info); -- cgit v1.1 From d118c06ebbee2d23ddf873cae4a809311aa61310 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:15 +0800 Subject: intel-iommu: trace domain id during page walk This patch only modifies the trace points. Previously we were tracing page walk levels. They are redundant since we have page mask (size) already. Now we trace something much more useful which is the domain ID of the page walking. That can be very useful when we trace more than one devices on the same system, so that we can know which map is for which domain. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/i386/intel_iommu.c | 16 ++++++++++------ hw/i386/trace-events | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index a882894..61bb3d3 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -756,6 +756,7 @@ typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); * @notify_unmap: whether we should notify invalid entries * @as: VT-d address space of the device * @aw: maximum address width + * @domain: domain ID of the page walk */ typedef struct { VTDAddressSpace *as; @@ -763,17 +764,18 @@ typedef struct { void *private; bool notify_unmap; uint8_t aw; + uint16_t domain_id; } vtd_page_walk_info; -static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level, - vtd_page_walk_info *info) +static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) { vtd_page_walk_hook hook_fn = info->hook_fn; void *private = info->private; assert(hook_fn); - trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr, - entry->addr_mask, entry->perm); + trace_vtd_page_walk_one(info->domain_id, entry->iova, + entry->translated_addr, entry->addr_mask, + entry->perm); return hook_fn(entry, private); } @@ -844,7 +846,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, trace_vtd_page_walk_skip_perm(iova, iova_next); goto next; } - ret = vtd_page_walk_one(&entry, level, info); + ret = vtd_page_walk_one(&entry, info); if (ret < 0) { return ret; } @@ -856,7 +858,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, * Translated address is meaningless, zero it. */ entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, level, info); + ret = vtd_page_walk_one(&entry, info); if (ret < 0) { return ret; } @@ -1466,6 +1468,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, .notify_unmap = true, .aw = s->aw_bits, .as = vtd_as, + .domain_id = domain_id, }; /* @@ -2947,6 +2950,7 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) .notify_unmap = false, .aw = s->aw_bits, .as = vtd_as, + .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi), }; vtd_page_walk(&ce, 0, ~0ULL, &info); diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 22d4464..ca23ba9 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -39,7 +39,7 @@ vtd_fault_disabled(void) "Fault processing disabled for context entry" vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 -vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" 
vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
-- cgit v1.1

From 63b88968f139b6a77f2f81e6f1eedf70c0170a85 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 18 May 2018 15:25:17 +0800
Subject: intel-iommu: rework the page walk logic

This patch fixes a potential small window in which the DMA page table
might be incomplete or invalid when the guest sends domain/context
invalidations to a device. This can cause random DMA errors for
assigned devices.

This is a major change to the VT-d shadow page walking logic. It
includes but is not limited to:

- For each VTDAddressSpace, now we maintain what IOVA ranges we have
  mapped and what we have not. With that information, we only send
  MAP or UNMAP when necessary: we don't send MAP notifies if we know
  we have already mapped the range, and we don't send UNMAP notifies
  if we know we never mapped the range at all.

- Introduce vtd_sync_shadow_page_table[_range] APIs so that we can
  call them anywhere to resync the shadow page table for a device.

- When we receive a domain/context invalidation, we should not really
  run the replay logic; instead we use the new sync shadow page table
  API to resync the whole shadow page table without unmapping the
  whole region.

After this change, we'll only do the page walk once for each domain
invalidation (before this it could be multiple, depending on the
number of notifiers per address space).

While at it, the page walking logic is also refactored to be simpler.

CC: QEMU Stable
Reported-by: Jintack Lim
Tested-by: Jintack Lim
Signed-off-by: Peter Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 213 ++++++++++++++++++++++++++++++++++++--------------
 hw/i386/trace-events  |   3 +-
 2 files changed, 157 insertions(+), 59 deletions(-)

(limited to 'hw')

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 61bb3d3..b5a09b7 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -769,10 +769,77 @@ typedef struct {
 
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
 {
+    VTDAddressSpace *as = info->as;
     vtd_page_walk_hook hook_fn = info->hook_fn;
     void *private = info->private;
+    DMAMap target = {
+        .iova = entry->iova,
+        .size = entry->addr_mask,
+        .translated_addr = entry->translated_addr,
+        .perm = entry->perm,
+    };
+    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
+        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+        return 0;
+    }
 
     assert(hook_fn);
+
+    /* Update local IOVA mapped ranges */
+    if (entry->perm) {
+        if (mapped) {
+            /* If it's exactly the same translation, skip */
+            if (!memcmp(mapped, &target, sizeof(target))) {
+                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
+                                                 entry->translated_addr);
+                return 0;
+            } else {
+                /*
+                 * Translation changed. Normally this should not
+                 * happen, but it can happen when with buggy guest
+                 * OSes. Note that there will be a small window that
+                 * we don't have map at all. But that's the best
+                 * effort we can do. The ideal way to emulate this is
+                 * atomically modify the PTE to follow what has
+                 * changed, but we can't. One example is that vfio
+                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
+                 * interface to modify a mapping (meanwhile it seems
+                 * meaningless to even provide one). Anyway, let's
+                 * mark this as a TODO in case one day we'll have
+                 * a better solution.
+ */ + IOMMUAccessFlags cache_perm = entry->perm; + int ret; + + /* Emulate an UNMAP */ + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); + ret = hook_fn(entry, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); + /* Recover the correct permission */ + entry->perm = cache_perm; + } + } + iova_tree_insert(as->iova_tree, &target); + } else { + if (!mapped) { + /* Skip since we didn't map this range at all */ + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + iova_tree_remove(as->iova_tree, &target); + } + trace_vtd_page_walk_one(info->domain_id, entry->iova, entry->translated_addr, entry->addr_mask, entry->perm); @@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, */ entry_valid = read_cur | write_cur; - entry.target_as = &address_space_memory; - entry.iova = iova & subpage_mask; - entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); - entry.addr_mask = ~subpage_mask; - - if (vtd_is_last_slpte(slpte, level)) { - /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); - if (!entry_valid && !info->notify_unmap) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - ret = vtd_page_walk_one(&entry, info); - if (ret < 0) { - return ret; - } - } else { - if (!entry_valid) { - if (info->notify_unmap) { - /* - * The whole entry is invalid; unmap it all. - * Translated address is meaningless, zero it. - */ - entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, info); - if (ret < 0) { - return ret; - } - } else { - trace_vtd_page_walk_skip_perm(iova, iova_next); - } - goto next; - } + if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + /* + * This is a valid PDE (or even bigger than PDE). We need + * to walk one further level. + */ ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), iova, MIN(iova_next, end), level - 1, read_cur, write_cur, info); - if (ret < 0) { - return ret; - } + } else { + /* + * This means we are either: + * + * (1) the real page entry (either 4K page, or huge page) + * (2) the whole range is invalid + * + * In either case, we send an IOTLB notification down. + */ + entry.target_as = &address_space_memory; + entry.iova = iova & subpage_mask; + entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); + entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + ret = vtd_page_walk_one(&entry, info); + } + + if (ret < 0) { + return ret; } next: @@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return 0; } +static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, + void *private) +{ + memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); + return 0; +} + +/* If context entry is NULL, we'll try to fetch it on our own. 
*/ +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, + VTDContextEntry *ce, + hwaddr addr, hwaddr size) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + vtd_page_walk_info info = { + .hook_fn = vtd_sync_shadow_page_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + .as = vtd_as, + }; + VTDContextEntry ce_cache; + int ret; + + if (ce) { + /* If the caller provided context entry, use it */ + ce_cache = *ce; + } else { + /* If the caller didn't provide ce, try to fetch */ + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce_cache); + if (ret) { + /* + * This should not really happen, but in case it happens, + * we just skip the sync for this time. After all we even + * don't have the root table pointer! + */ + trace_vtd_err("Detected invalid context entry when " + "trying to sync shadow page table"); + return 0; + } + } + + info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi); + + return vtd_page_walk(&ce_cache, addr, addr + size, &info); +} + +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +{ + return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX); +} + /* * Fetch translation type for specific device. Returns <0 if error * happens, otherwise return the shifted type to check against @@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) VTDAddressSpace *vtd_as; QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } @@ -1371,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, vtd_switch_address_space(vtd_as); /* * So a device is moving out of (or moving into) a - * domain, a replay() suites here to notify all the - * IOMMU_NOTIFIER_MAP registers about this change. + * domain, resync the shadow page table. * This won't bring bad even if we have no such * notifier registered - the IOMMU notification * framework will skip MAP notifications if that * happened. */ - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } @@ -1436,18 +1543,11 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } -static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry, - void *private) -{ - memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); - return 0; -} - static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) @@ -1462,21 +1562,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { if (vtd_as_has_map_notifier(vtd_as)) { - vtd_page_walk_info info = { - .hook_fn = vtd_page_invalidate_notify_hook, - .private = (void *)&vtd_as->iommu, - .notify_unmap = true, - .aw = s->aw_bits, - .as = vtd_as, - .domain_id = domain_id, - }; - /* * As long as we have MAP notifications registered in * any of our IOMMU notifiers, we need to sync the * shadow page table. 
*/ - vtd_page_walk(&ce, addr, addr + size, &info); + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); } else { /* * For UNMAP-only notifiers, we don't need to walk the @@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) vtd_dev_as->devfn = (uint8_t)devfn; vtd_dev_as->iommu_state = s; vtd_dev_as->context_cache_entry.context_cache_gen = 0; + vtd_dev_as->iova_tree = iova_tree_new(); /* * Memory region relationships looks like (Address range shows @@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) hwaddr start = n->start; hwaddr end = n->end; IntelIOMMUState *s = as->iommu_state; + DMAMap map; /* * Note: all the codes in this function has a assumption that IOVA @@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) VTD_PCI_FUNC(as->devfn), entry.iova, size); + map.iova = entry.iova; + map.size = entry.addr_mask; + iova_tree_remove(as->iova_tree, &map); + memory_region_notify_one(n, &entry); } diff --git a/hw/i386/trace-events b/hw/i386/trace-events index ca23ba9..e14d06e 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64 +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" -vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 -- cgit v1.1
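[Editor's note: the new vtd_page_walk_one() is effectively a small
reconciliation engine between the guest page table and the per-device
iova_tree. A reader's summary of its decision table, distilled from the hunks
above (no behavior beyond the patch is implied):

    /*
     * guest PTE    iova_tree state    action taken
     * ---------    ---------------    -----------------------------------
     * valid        not mapped         notify MAP, insert into tree
     * valid        identical map      skip; already in sync
     * valid        different map      notify UNMAP then MAP, update tree
     *                                 (buggy guest; see TODO in the code)
     * invalid      mapped             notify UNMAP, remove from tree
     * invalid      not mapped         skip; nothing was ever announced
     */

This bookkeeping is what lets domain and context invalidations resync the
shadow page table without first unmapping the whole region.]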