diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2018-05-24 14:22:23 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2018-05-24 14:22:23 +0100 |
commit | 45eabb2ede0caf2f3dd0e1ace045d0915c53ef4d (patch) | |
tree | ff8ceaa4ff25dcc341bd03ce64b7d5a40e13028c /hw | |
parent | 37cbe4da617e87778ea324c3f5d08ba780bed1ea (diff) | |
parent | 63b88968f139b6a77f2f81e6f1eedf70c0170a85 (diff) | |
download | qemu-45eabb2ede0caf2f3dd0e1ace045d0915c53ef4d.zip qemu-45eabb2ede0caf2f3dd0e1ace045d0915c53ef4d.tar.gz qemu-45eabb2ede0caf2f3dd0e1ace045d0915c53ef4d.tar.bz2 |
Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
pc, pci, virtio, vhost: fixes, features
Beginning of merging vDPA, new PCI ID, a new virtio balloon stat, intel
iommu rework fixing a couple of security problems (no CVEs yet), fixes
all over the place.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Wed 23 May 2018 15:41:32 BST
# gpg: using RSA key 281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* remotes/mst/tags/for_upstream: (28 commits)
intel-iommu: rework the page walk logic
util: implement simple iova tree
intel-iommu: trace domain id during page walk
intel-iommu: pass in address space when page walk
intel-iommu: introduce vtd_page_walk_info
intel-iommu: only do page walk for MAP notifiers
intel-iommu: add iommu lock
intel-iommu: remove IntelIOMMUNotifierNode
intel-iommu: send PSI always even if across PDEs
nvdimm: fix typo in label-size definition
contrib/vhost-user-blk: enable protocol feature for vhost-user-blk
hw/virtio: Fix brace Werror with clang 6.0.0
libvhost-user: Send messages with no data
vhost-user+postcopy: Use qemu_set_nonblock
virtio: support setting memory region based host notifier
vhost-user: support receiving file descriptors in slave_read
vhost-user: add Net prefix to internal state structure
linux-headers: add kvm header for mips
linux-headers: add unistd.h on all arches
update-linux-headers.sh: unistd.h, kvm consistency
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'hw')
-rw-r--r-- | hw/i386/intel_iommu.c | 396 | ||||
-rw-r--r-- | hw/i386/kvm/clock.c | 2 | ||||
-rw-r--r-- | hw/i386/trace-events | 5 | ||||
-rw-r--r-- | hw/mem/nvdimm.c | 2 | ||||
-rw-r--r-- | hw/pci-host/q35.c | 17 | ||||
-rw-r--r-- | hw/virtio/trace-events | 1 | ||||
-rw-r--r-- | hw/virtio/vhost-user.c | 45 | ||||
-rw-r--r-- | hw/virtio/vhost.c | 7 | ||||
-rw-r--r-- | hw/virtio/virtio-balloon.c | 2 | ||||
-rw-r--r-- | hw/virtio/virtio-pci.c | 22 | ||||
-rw-r--r-- | hw/virtio/virtio.c | 13 |
11 files changed, 399 insertions, 113 deletions
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index fb31de9..b5a09b7 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -128,6 +128,22 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, return new_val; } +static inline void vtd_iommu_lock(IntelIOMMUState *s) +{ + qemu_mutex_lock(&s->iommu_lock); +} + +static inline void vtd_iommu_unlock(IntelIOMMUState *s) +{ + qemu_mutex_unlock(&s->iommu_lock); +} + +/* Whether the address space needs to notify new mappings */ +static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) +{ + return as->notifier_flags & IOMMU_NOTIFIER_MAP; +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -172,9 +188,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, } /* Reset all the gen of VTDAddressSpace to zero and set the gen of - * IntelIOMMUState to 1. + * IntelIOMMUState to 1. Must be called with IOMMU lock held. */ -static void vtd_reset_context_cache(IntelIOMMUState *s) +static void vtd_reset_context_cache_locked(IntelIOMMUState *s) { VTDAddressSpace *vtd_as; VTDBus *vtd_bus; @@ -197,12 +213,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s) s->context_cache_gen = 1; } -static void vtd_reset_iotlb(IntelIOMMUState *s) +/* Must be called with IOMMU lock held. */ +static void vtd_reset_iotlb_locked(IntelIOMMUState *s) { assert(s->iotlb); g_hash_table_remove_all(s->iotlb); } +static void vtd_reset_iotlb(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); +} + static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, uint32_t level) { @@ -215,6 +239,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; } +/* Must be called with IOMMU lock held */ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, hwaddr addr) { @@ -235,6 +260,7 @@ out: return entry; } +/* Must be with IOMMU lock held */ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint16_t domain_id, hwaddr addr, uint64_t slpte, uint8_t access_flags, uint32_t level) @@ -246,7 +272,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { trace_vtd_iotlb_reset("iotlb exceeds size limit"); - vtd_reset_iotlb(s); + vtd_reset_iotlb_locked(s); } entry->gfn = gfn; @@ -723,22 +749,116 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); /** + * Constant information used during page walking + * + * @hook_fn: hook func to be called when detected page + * @private: private data to be passed into hook func + * @notify_unmap: whether we should notify invalid entries + * @as: VT-d address space of the device + * @aw: maximum address width + * @domain: domain ID of the page walk + */ +typedef struct { + VTDAddressSpace *as; + vtd_page_walk_hook hook_fn; + void *private; + bool notify_unmap; + uint8_t aw; + uint16_t domain_id; +} vtd_page_walk_info; + +static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) +{ + VTDAddressSpace *as = info->as; + vtd_page_walk_hook hook_fn = info->hook_fn; + void *private = info->private; + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (entry->perm == IOMMU_NONE && !info->notify_unmap) { + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + + assert(hook_fn); + + /* Update local IOVA mapped ranges */ + if (entry->perm) { + if (mapped) { + /* If it's exactly the same translation, skip */ + if (!memcmp(mapped, &target, sizeof(target))) { + trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, + entry->translated_addr); + return 0; + } else { + /* + * Translation changed. Normally this should not + * happen, but it can happen when with buggy guest + * OSes. Note that there will be a small window that + * we don't have map at all. But that's the best + * effort we can do. The ideal way to emulate this is + * atomically modify the PTE to follow what has + * changed, but we can't. One example is that vfio + * driver only has VFIO_IOMMU_[UN]MAP_DMA but no + * interface to modify a mapping (meanwhile it seems + * meaningless to even provide one). Anyway, let's + * mark this as a TODO in case one day we'll have + * a better solution. + */ + IOMMUAccessFlags cache_perm = entry->perm; + int ret; + + /* Emulate an UNMAP */ + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); + ret = hook_fn(entry, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); + /* Recover the correct permission */ + entry->perm = cache_perm; + } + } + iova_tree_insert(as->iova_tree, &target); + } else { + if (!mapped) { + /* Skip since we didn't map this range at all */ + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + iova_tree_remove(as->iova_tree, &target); + } + + trace_vtd_page_walk_one(info->domain_id, entry->iova, + entry->translated_addr, entry->addr_mask, + entry->perm); + return hook_fn(entry, private); +} + +/** * vtd_page_walk_level - walk over specific level for IOVA range * * @addr: base GPA addr to start the walk * @start: IOVA range start address * @end: IOVA range end address (start <= addr < end) - * @hook_fn: hook func to be called when detected page - * @private: private data to be passed into hook func * @read: whether parent level has read permission * @write: whether parent level has write permission - * @notify_unmap: whether we should notify invalid entries - * @aw: maximum address width + * @info: constant information for the page walk */ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, - uint64_t end, vtd_page_walk_hook hook_fn, - void *private, uint32_t level, bool read, - bool write, bool notify_unmap, uint8_t aw) + uint64_t end, uint32_t level, bool read, + bool write, vtd_page_walk_info *info) { bool read_cur, write_cur, entry_valid; uint32_t offset; @@ -781,37 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, */ entry_valid = read_cur | write_cur; - if (vtd_is_last_slpte(slpte, level)) { + if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + /* + * This is a valid PDE (or even bigger than PDE). We need + * to walk one further level. + */ + ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), + iova, MIN(iova_next, end), level - 1, + read_cur, write_cur, info); + } else { + /* + * This means we are either: + * + * (1) the real page entry (either 4K page, or huge page) + * (2) the whole range is invalid + * + * In either case, we send an IOTLB notification down. + */ entry.target_as = &address_space_memory; entry.iova = iova & subpage_mask; - /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte, aw); - entry.addr_mask = ~subpage_mask; entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); - if (!entry_valid && !notify_unmap) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr, - entry.addr_mask, entry.perm); - if (hook_fn) { - ret = hook_fn(&entry, private); - if (ret < 0) { - return ret; - } - } - } else { - if (!entry_valid) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova, - MIN(iova_next, end), hook_fn, private, - level - 1, read_cur, write_cur, - notify_unmap, aw); - if (ret < 0) { - return ret; - } + entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + ret = vtd_page_walk_one(&entry, info); + } + + if (ret < 0) { + return ret; } next: @@ -827,28 +944,24 @@ next: * @ce: context entry to walk upon * @start: IOVA address to start the walk * @end: IOVA range end address (start <= addr < end) - * @hook_fn: the hook that to be called for each detected area - * @private: private data for the hook function - * @aw: maximum address width + * @info: page walking information struct */ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, - vtd_page_walk_hook hook_fn, void *private, - bool notify_unmap, uint8_t aw) + vtd_page_walk_info *info) { dma_addr_t addr = vtd_ce_get_slpt_base(ce); uint32_t level = vtd_ce_get_level(ce); - if (!vtd_iova_range_check(start, ce, aw)) { + if (!vtd_iova_range_check(start, ce, info->aw)) { return -VTD_FR_ADDR_BEYOND_MGAW; } - if (!vtd_iova_range_check(end, ce, aw)) { + if (!vtd_iova_range_check(end, ce, info->aw)) { /* Fix end so that it reaches the maximum */ - end = vtd_iova_limit(ce, aw); + end = vtd_iova_limit(ce, info->aw); } - return vtd_page_walk_level(addr, start, end, hook_fn, private, - level, true, true, notify_unmap, aw); + return vtd_page_walk_level(addr, start, end, level, true, true, info); } /* Map a device to its corresponding domain (context-entry) */ @@ -907,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return 0; } +static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, + void *private) +{ + memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); + return 0; +} + +/* If context entry is NULL, we'll try to fetch it on our own. */ +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, + VTDContextEntry *ce, + hwaddr addr, hwaddr size) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + vtd_page_walk_info info = { + .hook_fn = vtd_sync_shadow_page_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + .as = vtd_as, + }; + VTDContextEntry ce_cache; + int ret; + + if (ce) { + /* If the caller provided context entry, use it */ + ce_cache = *ce; + } else { + /* If the caller didn't provide ce, try to fetch */ + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce_cache); + if (ret) { + /* + * This should not really happen, but in case it happens, + * we just skip the sync for this time. After all we even + * don't have the root table pointer! + */ + trace_vtd_err("Detected invalid context entry when " + "trying to sync shadow page table"); + return 0; + } + } + + info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi); + + return vtd_page_walk(&ce_cache, addr, addr + size, &info); +} + +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +{ + return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX); +} + /* * Fetch translation type for specific device. Returns <0 if error * happens, otherwise return the shifted type to check against @@ -1088,7 +1253,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, IntelIOMMUState *s = vtd_as->iommu_state; VTDContextEntry ce; uint8_t bus_num = pci_bus_num(bus); - VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry; + VTDContextCacheEntry *cc_entry; uint64_t slpte, page_mask; uint32_t level; uint16_t source_id = vtd_make_source_id(bus_num, devfn); @@ -1105,6 +1270,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, */ assert(!vtd_is_interrupt_addr(addr)); + vtd_iommu_lock(s); + + cc_entry = &vtd_as->context_cache_entry; + /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { @@ -1164,7 +1333,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, * IOMMU region can be swapped back. */ vtd_pt_enable_fast_path(s, source_id); - + vtd_iommu_unlock(s); return true; } @@ -1185,6 +1354,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte, access_flags, level); out: + vtd_iommu_unlock(s); entry->iova = addr & page_mask; entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; entry->addr_mask = ~page_mask; @@ -1192,6 +1362,7 @@ out: return true; error: + vtd_iommu_unlock(s); entry->iova = 0; entry->translated_addr = 0; entry->addr_mask = 0; @@ -1230,20 +1401,23 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) static void vtd_iommu_replay_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; - QLIST_FOREACH(node, &s->notifiers_list, next) { - memory_region_iommu_replay_all(&node->vtd_as->iommu); + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + vtd_sync_shadow_page_table(vtd_as); } } static void vtd_context_global_invalidate(IntelIOMMUState *s) { trace_vtd_inv_desc_cc_global(); + /* Protects context cache */ + vtd_iommu_lock(s); s->context_cache_gen++; if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { - vtd_reset_context_cache(s); + vtd_reset_context_cache_locked(s); } + vtd_iommu_unlock(s); vtd_switch_address_space_all(s); /* * From VT-d spec 6.5.2.1, a global context entry invalidation @@ -1295,7 +1469,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), VTD_PCI_FUNC(devfn_it)); + vtd_iommu_lock(s); vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); /* * Do switch address space when needed, in case if the * device passthrough bit is switched. @@ -1303,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, vtd_switch_address_space(vtd_as); /* * So a device is moving out of (or moving into) a - * domain, a replay() suites here to notify all the - * IOMMU_NOTIFIER_MAP registers about this change. + * domain, resync the shadow page table. * This won't bring bad even if we have no such * notifier registered - the IOMMU notification * framework will skip MAP notifications if that * happened. */ - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } @@ -1354,48 +1529,60 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) { - IntelIOMMUNotifierNode *node; VTDContextEntry ce; VTDAddressSpace *vtd_as; trace_vtd_inv_desc_iotlb_domain(domain_id); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); + vtd_iommu_unlock(s); - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } -static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry, - void *private) -{ - memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); - return 0; -} - static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; VTDContextEntry ce; int ret; + hwaddr size = (1 << am) * VTD_PAGE_SIZE; - QLIST_FOREACH(node, &(s->notifiers_list), next) { - VTDAddressSpace *vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE, - vtd_page_invalidate_notify_hook, - (void *)&vtd_as->iommu, true, s->aw_bits); + if (vtd_as_has_map_notifier(vtd_as)) { + /* + * As long as we have MAP notifications registered in + * any of our IOMMU notifiers, we need to sync the + * shadow page table. + */ + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); + } else { + /* + * For UNMAP-only notifiers, we don't need to walk the + * page tables. We just deliver the PSI down to + * invalidate caches. + */ + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = size - 1, + .perm = IOMMU_NONE, + }; + memory_region_notify_iommu(&vtd_as->iommu, entry); + } } } } @@ -1411,7 +1598,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, info.domain_id = domain_id; info.addr = addr; info.mask = ~((1 << am) - 1); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); + vtd_iommu_unlock(s); vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); } @@ -2326,8 +2515,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; - IntelIOMMUNotifierNode *node = NULL; - IntelIOMMUNotifierNode *next_node = NULL; if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) { error_report("We need to set caching-mode=1 for intel-iommu to enable " @@ -2335,22 +2522,13 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, exit(1); } - if (old == IOMMU_NOTIFIER_NONE) { - node = g_malloc0(sizeof(*node)); - node->vtd_as = vtd_as; - QLIST_INSERT_HEAD(&s->notifiers_list, node, next); - return; - } + /* Update per-address-space notifier flags */ + vtd_as->notifier_flags = new; - /* update notifier node with new flags */ - QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) { - if (node->vtd_as == vtd_as) { - if (new == IOMMU_NOTIFIER_NONE) { - QLIST_REMOVE(node, next); - g_free(node); - } - return; - } + if (old == IOMMU_NOTIFIER_NONE) { + QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(vtd_as, next); } } @@ -2719,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) vtd_dev_as->devfn = (uint8_t)devfn; vtd_dev_as->iommu_state = s; vtd_dev_as->context_cache_entry.context_cache_gen = 0; + vtd_dev_as->iova_tree = iova_tree_new(); /* * Memory region relationships looks like (Address range shows @@ -2771,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) hwaddr start = n->start; hwaddr end = n->end; IntelIOMMUState *s = as->iommu_state; + DMAMap map; /* * Note: all the codes in this function has a assumption that IOVA @@ -2815,17 +2995,19 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) VTD_PCI_FUNC(as->devfn), entry.iova, size); + map.iova = entry.iova; + map.size = entry.addr_mask; + iova_tree_remove(as->iova_tree, &map); + memory_region_notify_one(n, &entry); } static void vtd_address_space_unmap_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; VTDAddressSpace *vtd_as; IOMMUNotifier *n; - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { vtd_address_space_unmap(vtd_as, n); } @@ -2857,8 +3039,19 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) PCI_FUNC(vtd_as->devfn), VTD_CONTEXT_ENTRY_DID(ce.hi), ce.hi, ce.lo); - vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, - s->aw_bits); + if (vtd_as_has_map_notifier(vtd_as)) { + /* This is required only for MAP typed notifiers */ + vtd_page_walk_info info = { + .hook_fn = vtd_replay_hook, + .private = (void *)n, + .notify_unmap = false, + .aw = s->aw_bits, + .as = vtd_as, + .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi), + }; + + vtd_page_walk(&ce, 0, ~0ULL, &info); + } } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), PCI_FUNC(vtd_as->devfn)); @@ -2930,8 +3123,10 @@ static void vtd_init(IntelIOMMUState *s) s->cap |= VTD_CAP_CM; } - vtd_reset_context_cache(s); - vtd_reset_iotlb(s); + vtd_iommu_lock(s); + vtd_reset_context_cache_locked(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); /* Define registers with default values and bit semantics */ vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); @@ -3070,7 +3265,8 @@ static void vtd_realize(DeviceState *dev, Error **errp) return; } - QLIST_INIT(&s->notifiers_list); + QLIST_INIT(&s->vtd_as_with_notifiers); + qemu_mutex_init(&s->iommu_lock); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 7dac319..0bf1c60 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -26,7 +26,7 @@ #include "qapi/error.h" #include <linux/kvm.h> -#include <linux/kvm_para.h> +#include "standard-headers/asm-x86/kvm_para.h" #define TYPE_KVM_CLOCK "kvmclock" #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK) diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 22d4464..e14d06e 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -39,9 +39,10 @@ vtd_fault_disabled(void) "Fault processing disabled for context entry" vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 -vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64 +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" -vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index acb656b..4087aca 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -89,7 +89,7 @@ static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp) static void nvdimm_init(Object *obj) { - object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int", + object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); object_property_add_bool(obj, NVDIMM_UNARMED_PROP, diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index a36a119..02f9576 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -535,13 +535,15 @@ static void mch_realize(PCIDevice *d, Error **errp) /* if *disabled* show SMRAM to all CPUs */ memory_region_init_alias(&mch->smram_region, OBJECT(mch), "smram-region", - mch->pci_address_space, 0xa0000, 0x20000); - memory_region_add_subregion_overlap(mch->system_memory, 0xa0000, + mch->pci_address_space, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); + memory_region_add_subregion_overlap(mch->system_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, &mch->smram_region, 1); memory_region_set_enabled(&mch->smram_region, true); memory_region_init_alias(&mch->open_high_smram, OBJECT(mch), "smram-open-high", - mch->ram_memory, 0xa0000, 0x20000); + mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); memory_region_add_subregion_overlap(mch->system_memory, 0xfeda0000, &mch->open_high_smram, 1); memory_region_set_enabled(&mch->open_high_smram, false); @@ -550,11 +552,14 @@ static void mch_realize(PCIDevice *d, Error **errp) memory_region_init(&mch->smram, OBJECT(mch), "smram", 1ull << 32); memory_region_set_enabled(&mch->smram, true); memory_region_init_alias(&mch->low_smram, OBJECT(mch), "smram-low", - mch->ram_memory, 0xa0000, 0x20000); + mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); memory_region_set_enabled(&mch->low_smram, true); - memory_region_add_subregion(&mch->smram, 0xa0000, &mch->low_smram); + memory_region_add_subregion(&mch->smram, MCH_HOST_BRIDGE_SMRAM_C_BASE, + &mch->low_smram); memory_region_init_alias(&mch->high_smram, OBJECT(mch), "smram-high", - mch->ram_memory, 0xa0000, 0x20000); + mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); memory_region_set_enabled(&mch->high_smram, true); memory_region_add_subregion(&mch->smram, 0xfeda0000, &mch->high_smram); diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 1422ff0..07bcbe9 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -6,6 +6,7 @@ vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64 vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64 vhost_section(const char *name, int r) "%s:%d" +vhost_iotlb_miss(void *dev, int step) "%p step %d" # hw/virtio/vhost-user.c vhost_user_postcopy_end_entry(void) "" diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 38da869..ca554d4 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -852,14 +852,44 @@ static void slave_read(void *opaque) VhostUserHeader hdr = { 0, }; VhostUserPayload payload = { 0, }; int size, ret = 0; + struct iovec iov; + struct msghdr msgh; + int fd = -1; + char control[CMSG_SPACE(sizeof(fd))]; + struct cmsghdr *cmsg; + size_t fdsize; + + memset(&msgh, 0, sizeof(msgh)); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); /* Read header */ - size = read(u->slave_fd, &hdr, VHOST_USER_HDR_SIZE); + iov.iov_base = &hdr; + iov.iov_len = VHOST_USER_HDR_SIZE; + + size = recvmsg(u->slave_fd, &msgh, 0); if (size != VHOST_USER_HDR_SIZE) { error_report("Failed to read from slave."); goto err; } + if (msgh.msg_flags & MSG_CTRUNC) { + error_report("Truncated message."); + goto err; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + fdsize = cmsg->cmsg_len - CMSG_LEN(0); + memcpy(&fd, CMSG_DATA(cmsg), fdsize); + break; + } + } + if (hdr.size > VHOST_USER_PAYLOAD_SIZE) { error_report("Failed to read msg header." " Size %d exceeds the maximum %zu.", hdr.size, @@ -883,9 +913,15 @@ static void slave_read(void *opaque) break; default: error_report("Received unexpected msg type."); + if (fd != -1) { + close(fd); + } ret = -EINVAL; } + /* Message handlers need to make sure that fd will be consumed. */ + fd = -1; + /* * REPLY_ACK feature handling. Other reply types has to be managed * directly in their request handlers. @@ -918,6 +954,9 @@ err: qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL); close(u->slave_fd); u->slave_fd = -1; + if (fd != -1) { + close(fd); + } return; } @@ -1076,7 +1115,7 @@ static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp) error_setg(errp, "%s: Failed to get ufd", __func__); return -1; } - fcntl(ufd, F_SETFL, O_NONBLOCK); + qemu_set_nonblock(ufd); /* register ufd with userfault thread */ u->postcopy_fd.fd = ufd; @@ -1316,7 +1355,7 @@ static bool vhost_user_requires_shm_log(struct vhost_dev *dev) static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) { - VhostUserMsg msg = { 0 }; + VhostUserMsg msg = { }; assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 9d5850a..b082900 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -894,12 +894,15 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) rcu_read_lock(); + trace_vhost_iotlb_miss(dev, 1); + iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, iova, write); if (iotlb.target_as != NULL) { ret = vhost_memory_region_lookup(dev, iotlb.translated_addr, &uaddr, &len); if (ret) { + trace_vhost_iotlb_miss(dev, 3); error_report("Fail to lookup the translated address " "%"PRIx64, iotlb.translated_addr); goto out; @@ -911,10 +914,14 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, iotlb.perm); if (ret) { + trace_vhost_iotlb_miss(dev, 4); error_report("Fail to update device iotlb"); goto out; } } + + trace_vhost_iotlb_miss(dev, 2); + out: rcu_read_unlock(); diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index f456cea..1f7a87f 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -52,6 +52,8 @@ static const char *balloon_stat_names[] = { [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory", [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory", [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches", + [VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc", + [VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail", [VIRTIO_BALLOON_S_NR] = NULL }; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 1e8ab7b..5eb0c32 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1037,6 +1037,27 @@ assign_error: return r; } +static int virtio_pci_set_host_notifier_mr(DeviceState *d, int n, + MemoryRegion *mr, bool assign) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + int offset; + + if (n >= VIRTIO_QUEUE_MAX || !virtio_pci_modern(proxy) || + virtio_pci_queue_mem_mult(proxy) != memory_region_size(mr)) { + return -1; + } + + if (assign) { + offset = virtio_pci_queue_mem_mult(proxy) * n; + memory_region_add_subregion_overlap(&proxy->notify.mr, offset, mr, 1); + } else { + memory_region_del_subregion(&proxy->notify.mr, mr); + } + + return 0; +} + static void virtio_pci_vmstate_change(DeviceState *d, bool running) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); @@ -2652,6 +2673,7 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data) k->has_extra_state = virtio_pci_has_extra_state; k->query_guest_notifiers = virtio_pci_query_guest_notifiers; k->set_guest_notifiers = virtio_pci_set_guest_notifiers; + k->set_host_notifier_mr = virtio_pci_set_host_notifier_mr; k->vmstate_change = virtio_pci_vmstate_change; k->pre_plugged = virtio_pci_pre_plugged; k->device_plugged = virtio_pci_device_plugged; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 006d3d1..1debb01 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2454,6 +2454,19 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq) return &vq->host_notifier; } +int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n, + MemoryRegion *mr, bool assign) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + if (k->set_host_notifier_mr) { + return k->set_host_notifier_mr(qbus->parent, n, mr, assign); + } + + return -1; +} + void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name) { g_free(vdev->bus_name); |