diff options
Diffstat (limited to 'hw/vfio/common.c')
-rw-r--r-- | hw/vfio/common.c | 315 |
1 files changed, 303 insertions, 12 deletions
diff --git a/hw/vfio/common.c b/hw/vfio/common.c index ae5654f..3f0d111 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -36,6 +36,7 @@ #include "qemu/range.h" #include "sysemu/kvm.h" #include "sysemu/reset.h" +#include "sysemu/runstate.h" #include "trace.h" #include "qapi/error.h" #include "migration/migration.h" @@ -134,6 +135,29 @@ static const char *index_to_str(VFIODevice *vbasedev, int index) } } +static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) +{ + switch (container->iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + /* + * We support coordinated discarding of RAM via the RamDiscardManager. + */ + return ram_block_uncoordinated_discard_disable(state); + default: + /* + * VFIO_SPAPR_TCE_IOMMU most probably works just fine with + * RamDiscardManager, however, it is completely untested. + * + * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does + * completely the opposite of managing mapping/pinning dynamically as + * required by RamDiscardManager. We would have to special-case sections + * with a RamDiscardManager. + */ + return ram_block_discard_disable(state); + } +} + int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, int action, int fd, Error **errp) { @@ -569,6 +593,44 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, error_report("iommu map to non memory area %"HWADDR_PRIx"", xlat); return false; + } else if (memory_region_has_ram_discard_manager(mr)) { + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr); + MemoryRegionSection tmp = { + .mr = mr, + .offset_within_region = xlat, + .size = int128_make64(len), + }; + + /* + * Malicious VMs can map memory into the IOMMU, which is expected + * to remain discarded. vfio will pin all pages, populating memory. + * Disallow that. vmstate priorities make sure any RamDiscardManager + * were already restored before IOMMUs are restored. + */ + if (!ram_discard_manager_is_populated(rdm, &tmp)) { + error_report("iommu map to discarded memory (e.g., unplugged via" + " virtio-mem): %"HWADDR_PRIx"", + iotlb->translated_addr); + return false; + } + + /* + * Malicious VMs might trigger discarding of IOMMU-mapped memory. The + * pages will remain pinned inside vfio until unmapped, resulting in a + * higher memory consumption than expected. If memory would get + * populated again later, there would be an inconsistency between pages + * pinned by vfio and pages seen by QEMU. This is the case until + * unmapped from the IOMMU (e.g., during device reset). + * + * With malicious guests, we really only care about pinning more memory + * than expected. RLIMIT_MEMLOCK set for the user/process can never be + * exceeded and can be used to mitigate this problem. + */ + warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" + " RAM (e.g., virtio-mem) works, however, malicious" + " guests can trigger pinning of more memory than" + " intended via an IOMMU. It's possible to mitigate " + " by setting/adjusting RLIMIT_MEMLOCK."); } /* @@ -649,6 +711,153 @@ out: rcu_read_unlock(); } +static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + int ret; + + /* Unmap with a single call. */ + ret = vfio_dma_unmap(vrdl->container, iova, size , NULL); + if (ret) { + error_report("%s: vfio_dma_unmap() failed: %s", __func__, + strerror(-ret)); + } +} + +static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); + const hwaddr end = section->offset_within_region + + int128_get64(section->size); + hwaddr start, next, iova; + void *vaddr; + int ret; + + /* + * Map in (aligned within memory region) minimum granularity, so we can + * unmap in minimum granularity later. + */ + for (start = section->offset_within_region; start < end; start = next) { + next = ROUND_UP(start + 1, vrdl->granularity); + next = MIN(next, end); + + iova = start - section->offset_within_region + + section->offset_within_address_space; + vaddr = memory_region_get_ram_ptr(section->mr) + start; + + ret = vfio_dma_map(vrdl->container, iova, next - start, + vaddr, section->readonly); + if (ret) { + /* Rollback */ + vfio_ram_discard_notify_discard(rdl, section); + return ret; + } + } + return 0; +} + +static void vfio_register_ram_discard_listener(VFIOContainer *container, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl; + + /* Ignore some corner cases not relevant in practice. */ + g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE)); + g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, + TARGET_PAGE_SIZE)); + g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); + + vrdl = g_new0(VFIORamDiscardListener, 1); + vrdl->container = container; + vrdl->mr = section->mr; + vrdl->offset_within_address_space = section->offset_within_address_space; + vrdl->size = int128_get64(section->size); + vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, + section->mr); + + g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); + g_assert(vrdl->granularity >= 1 << ctz64(container->pgsizes)); + + ram_discard_listener_init(&vrdl->listener, + vfio_ram_discard_notify_populate, + vfio_ram_discard_notify_discard, true); + ram_discard_manager_register_listener(rdm, &vrdl->listener, section); + QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next); + + /* + * Sanity-check if we have a theoretically problematic setup where we could + * exceed the maximum number of possible DMA mappings over time. We assume + * that each mapped section in the same address space as a RamDiscardManager + * section consumes exactly one DMA mapping, with the exception of + * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections + * in the same address space as RamDiscardManager sections. + * + * We assume that each section in the address space consumes one memslot. + * We take the number of KVM memory slots as a best guess for the maximum + * number of sections in the address space we could have over time, + * also consuming DMA mappings. + */ + if (container->dma_max_mappings) { + unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; + +#ifdef CONFIG_KVM + if (kvm_enabled()) { + max_memslots = kvm_get_max_memslots(); + } +#endif + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + hwaddr start, end; + + start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, + vrdl->granularity); + end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, + vrdl->granularity); + vrdl_mappings += (end - start) / vrdl->granularity; + vrdl_count++; + } + + if (vrdl_mappings + max_memslots - vrdl_count > + container->dma_max_mappings) { + warn_report("%s: possibly running out of DMA mappings. E.g., try" + " increasing the 'block-size' of virtio-mem devies." + " Maximum possible DMA mappings: %d, Maximum possible" + " memslots: %d", __func__, container->dma_max_mappings, + max_memslots); + } + } +} + +static void vfio_unregister_ram_discard_listener(VFIOContainer *container, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to unregister missing RAM discard listener"); + } + + ram_discard_manager_unregister_listener(rdm, &vrdl->listener); + QLIST_REMOVE(vrdl, next); + g_free(vrdl); +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { @@ -810,6 +1019,16 @@ static void vfio_listener_region_add(MemoryListener *listener, /* Here we assume that memory_region_is_ram(section->mr)==true */ + /* + * For RAM memory regions with a RamDiscardManager, we only want to map the + * actually populated parts - and update the mapping whenever we're notified + * about changes. + */ + if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_register_ram_discard_listener(container, section); + return; + } + vaddr = memory_region_get_ram_ptr(section->mr) + section->offset_within_region + (iova - section->offset_within_address_space); @@ -947,6 +1166,10 @@ static void vfio_listener_region_del(MemoryListener *listener, pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); + } else if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_unregister_ram_discard_listener(container, section); + /* Unregistering will trigger an unmap. */ + try_unmap = false; } if (try_unmap) { @@ -1108,6 +1331,49 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_unlock(); } +static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, + void *opaque) +{ + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + VFIORamDiscardListener *vrdl = opaque; + + /* + * Sync the whole mapped region (spanning multiple individual mappings) + * in one go. + */ + return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr); +} + +static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + } + + /* + * We only want/can synchronize the bitmap for actually mapped parts - + * which correspond to populated parts. Replay all populated parts. + */ + return ram_discard_manager_replay_populated(rdm, section, + vfio_ram_discard_get_dirty_bitmap, + &vrdl); +} + static int vfio_sync_dirty_bitmap(VFIOContainer *container, MemoryRegionSection *section) { @@ -1139,6 +1405,8 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, } } return 0; + } else if (memory_region_has_ram_discard_manager(section->mr)) { + return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); } ram_addr = memory_region_get_ram_addr(section->mr) + @@ -1732,15 +2000,25 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * new memory, it will not yet set ram_block_discard_set_required() and * therefore, neither stops us here or deals with the sudden memory * consumption of inflated memory. + * + * We do support discarding of memory coordinated via the RamDiscardManager + * with some IOMMU types. vfio_ram_block_discard_disable() handles the + * details once we know which type of IOMMU we are using. */ - ret = ram_block_discard_disable(true); - if (ret) { - error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); - return ret; - } QLIST_FOREACH(container, &space->containers, next) { if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, + "Cannot set discarding of RAM broken"); + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, + &container->fd)) { + error_report("vfio: error disconnecting group %d from" + " container", group->groupid); + } + return ret; + } group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); vfio_kvm_device_add_group(group); @@ -1768,14 +2046,22 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->fd = fd; container->error = NULL; container->dirty_pages_supported = false; + container->dma_max_mappings = 0; QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&container->vrdl_list); ret = vfio_init_container(container, group->fd, errp); if (ret) { goto free_container_exit; } + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); + goto free_container_exit; + } + switch (container->iommu_type) { case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: @@ -1798,7 +2084,10 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes); container->pgsizes = info->iova_pgsizes; + /* The default in the kernel ("dma_entry_limit") is 65535. */ + container->dma_max_mappings = 65535; if (!ret) { + vfio_get_info_dma_avail(info, &container->dma_max_mappings); vfio_get_iommu_info_migration(container, info); } g_free(info); @@ -1820,7 +2109,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (ret) { error_setg_errno(errp, errno, "failed to enable container"); ret = -errno; - goto free_container_exit; + goto enable_discards_exit; } } else { container->prereg_listener = vfio_prereg_listener; @@ -1832,7 +2121,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, ret = -1; error_propagate_prepend(errp, container->error, "RAM memory listener initialization failed: "); - goto free_container_exit; + goto enable_discards_exit; } } @@ -1845,7 +2134,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (v2) { memory_listener_unregister(&container->prereg_listener); } - goto free_container_exit; + goto enable_discards_exit; } if (v2) { @@ -1860,7 +2149,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (ret) { error_setg_errno(errp, -ret, "failed to remove existing window"); - goto free_container_exit; + goto enable_discards_exit; } } else { /* The default table uses 4K pages */ @@ -1901,6 +2190,9 @@ listener_release_exit: vfio_kvm_device_del_group(group); vfio_listener_release(container); +enable_discards_exit: + vfio_ram_block_discard_disable(container, false); + free_container_exit: g_free(container); @@ -1908,7 +2200,6 @@ close_fd_exit: close(fd); put_space_exit: - ram_block_discard_disable(false); vfio_put_address_space(space); return ret; @@ -2030,7 +2321,7 @@ void vfio_put_group(VFIOGroup *group) } if (!group->ram_block_discard_allowed) { - ram_block_discard_disable(false); + vfio_ram_block_discard_disable(group->container, false); } vfio_kvm_device_del_group(group); vfio_disconnect_container(group); @@ -2084,7 +2375,7 @@ int vfio_get_device(VFIOGroup *group, const char *name, if (!group->ram_block_discard_allowed) { group->ram_block_discard_allowed = true; - ram_block_discard_disable(false); + vfio_ram_block_discard_disable(group->container, false); } } |