From 1a957cf9c4637abe4b7d67a91312a2565306641e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 3 Feb 2018 10:39:33 -0500 Subject: coroutine-lock: convert CoQueue to use QemuLockable There are cases in which a queued coroutine must be restarted from non-coroutine context (with qemu_co_enter_next). In this cases, qemu_co_enter_next also needs to be thread-safe, but it cannot use a CoMutex and so cannot qemu_co_queue_wait. Use QemuLockable so that the CoQueue can interchangeably use CoMutex or QemuMutex. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Message-Id: <20180203153935.8056-4-pbonzini@redhat.com> Reviewed-by: Fam Zheng Signed-off-by: Fam Zheng --- util/qemu-coroutine-lock.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'util') diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index 846ff91..2a66fc1 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -40,13 +40,13 @@ void qemu_co_queue_init(CoQueue *queue) QSIMPLEQ_INIT(&queue->entries); } -void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex) +void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock) { Coroutine *self = qemu_coroutine_self(); QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next); - if (mutex) { - qemu_co_mutex_unlock(mutex); + if (lock) { + qemu_lockable_unlock(lock); } /* There is no race condition here. Other threads will call @@ -60,9 +60,11 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex) /* TODO: OSv implements wait morphing here, where the wakeup * primitive automatically places the woken coroutine on the * mutex's queue. This avoids the thundering herd effect. + * This could be implemented for CoMutexes, but not really for + * other cases of QemuLockable. */ - if (mutex) { - qemu_co_mutex_lock(mutex); + if (lock) { + qemu_lockable_lock(lock); } } -- cgit v1.1 From 5261dd7b0106a88c9b323607136b1dfa1d7db689 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 3 Feb 2018 10:39:34 -0500 Subject: coroutine-lock: make qemu_co_enter_next thread-safe qemu_co_queue_next does not need to release and re-acquire the mutex, because the queued coroutine does not run immediately. However, this does not hold for qemu_co_enter_next. Now that qemu_co_queue_wait can synchronize (via QemuLockable) with code that is not running in coroutine context, it's important that code using qemu_co_enter_next can easily use a standardized locking idiom. First of all, qemu_co_enter_next must use aio_co_wake to restart the coroutine. Second, the function gains a second argument, a QemuLockable*, and the comments of qemu_co_queue_next and qemu_co_queue_restart_all are adjusted to clarify the difference. Signed-off-by: Paolo Bonzini Message-Id: <20180203153935.8056-5-pbonzini@redhat.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Fam Zheng Signed-off-by: Fam Zheng --- util/qemu-coroutine-lock.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'util') diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index 2a66fc1..78fb79a 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -132,7 +132,7 @@ void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue) qemu_co_queue_do_restart(queue, false); } -bool qemu_co_enter_next(CoQueue *queue) +bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock) { Coroutine *next; @@ -142,7 +142,13 @@ bool qemu_co_enter_next(CoQueue *queue) } QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next); - qemu_coroutine_enter(next); + if (lock) { + qemu_lockable_unlock(lock); + } + aio_co_wake(next); + if (lock) { + qemu_lockable_lock(lock); + } return true; } -- cgit v1.1 From 418026ca43bc2626db092d7558258f9594366f28 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 16 Jan 2018 14:08:54 +0800 Subject: util: Introduce vfio helpers This is a library to manage the host vfio interface, which could be used to implement userspace device driver code in QEMU such as NVMe or net controllers. Signed-off-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Message-Id: <20180116060901.17413-3-famz@redhat.com> Signed-off-by: Fam Zheng --- util/Makefile.objs | 1 + util/trace-events | 11 + util/vfio-helpers.c | 727 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 739 insertions(+) create mode 100644 util/vfio-helpers.c (limited to 'util') diff --git a/util/Makefile.objs b/util/Makefile.objs index 2973b0a..3fb6116 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -46,3 +46,4 @@ util-obj-y += qht.o util-obj-y += range.o util-obj-y += stats64.o util-obj-y += systemd.o +util-obj-$(CONFIG_LINUX) += vfio-helpers.o diff --git a/util/trace-events b/util/trace-events index 515e625..4822434 100644 --- a/util/trace-events +++ b/util/trace-events @@ -60,3 +60,14 @@ lockcnt_futex_wake(const void *lockcnt) "lockcnt %p waking up one waiter" qemu_mutex_lock(void *mutex, const char *file, const int line) "waiting on mutex %p (%s:%d)" qemu_mutex_locked(void *mutex, const char *file, const int line) "taken mutex %p (%s:%d)" qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex %p (%s:%d)" + +# util/vfio-helpers.c +qemu_vfio_dma_reset_temporary(void *s) "s %p" +qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx" +qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx" +qemu_vfio_find_mapping(void *s, void *p) "s %p host %p" +qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size %zu index %d iova 0x%"PRIx64 +qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size %zu iova 0x%"PRIx64 +qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size %zu temporary %d iova %p" +qemu_vfio_dma_map_invalid(void *s, void *mapping_host, size_t mapping_size, void *host, size_t size) "s %p mapping %p %zu requested %p %zu" +qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p" diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c new file mode 100644 index 0000000..f478b68 --- /dev/null +++ b/util/vfio-helpers.c @@ -0,0 +1,727 @@ +/* + * VFIO utility + * + * Copyright 2016 - 2018 Red Hat, Inc. + * + * Authors: + * Fam Zheng + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include +#include +#include "qapi/error.h" +#include "exec/ramlist.h" +#include "exec/cpu-common.h" +#include "trace.h" +#include "qemu/queue.h" +#include "qemu/error-report.h" +#include "standard-headers/linux/pci_regs.h" +#include "qemu/event_notifier.h" +#include "qemu/vfio-helpers.h" +#include "trace.h" + +#define QEMU_VFIO_DEBUG 0 + +#define QEMU_VFIO_IOVA_MIN 0x10000ULL +/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface, + * we can use a runtime limit; alternatively it's also possible to do platform + * specific detection by reading sysfs entries. Until then, 39 is a safe bet. + **/ +#define QEMU_VFIO_IOVA_MAX (1ULL << 39) + +typedef struct { + /* Page aligned addr. */ + void *host; + size_t size; + uint64_t iova; +} IOVAMapping; + +struct QEMUVFIOState { + QemuMutex lock; + + /* These fields are protected by BQL */ + int container; + int group; + int device; + RAMBlockNotifier ram_notifier; + struct vfio_region_info config_region_info, bar_region_info[6]; + + /* These fields are protected by @lock */ + /* VFIO's IO virtual address space is managed by splitting into a few + * sections: + * + * --------------- <= 0 + * |xxxxxxxxxxxxx| + * |-------------| <= QEMU_VFIO_IOVA_MIN + * | | + * | Fixed | + * | | + * |-------------| <= low_water_mark + * | | + * | Free | + * | | + * |-------------| <= high_water_mark + * | | + * | Temp | + * | | + * |-------------| <= QEMU_VFIO_IOVA_MAX + * |xxxxxxxxxxxxx| + * |xxxxxxxxxxxxx| + * --------------- + * + * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid; + * + * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of + * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be + * reclaimed - low_water_mark never shrinks; + * + * - IOVAs in range [low_water_mark, high_water_mark) are free; + * + * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile + * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area + * is recycled. The caller should make sure I/O's depending on these + * mappings are completed before calling. + **/ + uint64_t low_water_mark; + uint64_t high_water_mark; + IOVAMapping *mappings; + int nr_mappings; +}; + +/** + * Find group file by PCI device address as specified @device, and return the + * path. The returned string is owned by caller and should be g_free'ed later. + */ +static char *sysfs_find_group_file(const char *device, Error **errp) +{ + char *sysfs_link; + char *sysfs_group; + char *p; + char *path = NULL; + + sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device); + sysfs_group = g_malloc(PATH_MAX); + if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) { + error_setg_errno(errp, errno, "Failed to find iommu group sysfs path"); + goto out; + } + p = strrchr(sysfs_group, '/'); + if (!p) { + error_setg(errp, "Failed to find iommu group number"); + goto out; + } + + path = g_strdup_printf("/dev/vfio/%s", p + 1); +out: + g_free(sysfs_link); + g_free(sysfs_group); + return path; +} + +static inline void assert_bar_index_valid(QEMUVFIOState *s, int index) +{ + assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info)); +} + +static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) +{ + assert_bar_index_valid(s, index); + s->bar_region_info[index] = (struct vfio_region_info) { + .index = VFIO_PCI_BAR0_REGION_INDEX + index, + .argsz = sizeof(struct vfio_region_info), + }; + if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) { + error_setg_errno(errp, errno, "Failed to get BAR region info"); + return -errno; + } + + return 0; +} + +/** + * Map a PCI bar area. + */ +void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, + uint64_t offset, uint64_t size, + Error **errp) +{ + void *p; + assert_bar_index_valid(s, index); + p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), + PROT_READ | PROT_WRITE, MAP_SHARED, + s->device, s->bar_region_info[index].offset + offset); + if (p == MAP_FAILED) { + error_setg_errno(errp, errno, "Failed to map BAR region"); + p = NULL; + } + return p; +} + +/** + * Unmap a PCI bar area. + */ +void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, + uint64_t offset, uint64_t size) +{ + if (bar) { + munmap(bar, MIN(size, s->bar_region_info[index].size - offset)); + } +} + +/** + * Initialize device IRQ with @irq_type and and register an event notifier. + */ +int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e, + int irq_type, Error **errp) +{ + int r; + struct vfio_irq_set *irq_set; + size_t irq_set_size; + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + + irq_info.index = irq_type; + if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) { + error_setg_errno(errp, errno, "Failed to get device interrupt info"); + return -errno; + } + if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { + error_setg(errp, "Device interrupt doesn't support eventfd"); + return -EINVAL; + } + + irq_set_size = sizeof(*irq_set) + sizeof(int); + irq_set = g_malloc0(irq_set_size); + + /* Get to a known IRQ state */ + *irq_set = (struct vfio_irq_set) { + .argsz = irq_set_size, + .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, + .index = irq_info.index, + .start = 0, + .count = 1, + }; + + *(int *)&irq_set->data = event_notifier_get_fd(e); + r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set); + g_free(irq_set); + if (r) { + error_setg_errno(errp, errno, "Failed to setup device interrupt"); + return -errno; + } + return 0; +} + +static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf, + int size, int ofs) +{ + int ret; + + do { + ret = pread(s->device, buf, size, s->config_region_info.offset + ofs); + } while (ret == -1 && errno == EINTR); + return ret == size ? 0 : -errno; +} + +static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs) +{ + int ret; + + do { + ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs); + } while (ret == -1 && errno == EINTR); + return ret == size ? 0 : -errno; +} + +static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, + Error **errp) +{ + int ret; + int i; + uint16_t pci_cmd; + struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; + struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + char *group_file = NULL; + + /* Create a new container */ + s->container = open("/dev/vfio/vfio", O_RDWR); + + if (s->container == -1) { + error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio"); + return -errno; + } + if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) { + error_setg(errp, "Invalid VFIO version"); + ret = -EINVAL; + goto fail_container; + } + + if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { + error_setg_errno(errp, errno, "VFIO IOMMU check failed"); + ret = -EINVAL; + goto fail_container; + } + + /* Open the group */ + group_file = sysfs_find_group_file(device, errp); + if (!group_file) { + ret = -EINVAL; + goto fail_container; + } + + s->group = open(group_file, O_RDWR); + if (s->group == -1) { + error_setg_errno(errp, errno, "Failed to open VFIO group file: %s", + group_file); + g_free(group_file); + ret = -errno; + goto fail_container; + } + g_free(group_file); + + /* Test the group is viable and available */ + if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) { + error_setg_errno(errp, errno, "Failed to get VFIO group status"); + ret = -errno; + goto fail; + } + + if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + error_setg(errp, "VFIO group is not viable"); + ret = -EINVAL; + goto fail; + } + + /* Add the group to the container */ + if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) { + error_setg_errno(errp, errno, "Failed to add group to VFIO container"); + ret = -errno; + goto fail; + } + + /* Enable the IOMMU model we want */ + if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) { + error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type"); + ret = -errno; + goto fail; + } + + /* Get additional IOMMU info */ + if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) { + error_setg_errno(errp, errno, "Failed to get IOMMU info"); + ret = -errno; + goto fail; + } + + s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device); + + if (s->device < 0) { + error_setg_errno(errp, errno, "Failed to get device fd"); + ret = -errno; + goto fail; + } + + /* Test and setup the device */ + if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) { + error_setg_errno(errp, errno, "Failed to get device info"); + ret = -errno; + goto fail; + } + + if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) { + error_setg(errp, "Invalid device regions"); + ret = -EINVAL; + goto fail; + } + + s->config_region_info = (struct vfio_region_info) { + .index = VFIO_PCI_CONFIG_REGION_INDEX, + .argsz = sizeof(struct vfio_region_info), + }; + if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) { + error_setg_errno(errp, errno, "Failed to get config region info"); + ret = -errno; + goto fail; + } + + for (i = 0; i < 6; i++) { + ret = qemu_vfio_pci_init_bar(s, i, errp); + if (ret) { + goto fail; + } + } + + /* Enable bus master */ + ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); + if (ret) { + goto fail; + } + pci_cmd |= PCI_COMMAND_MASTER; + ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); + if (ret) { + goto fail; + } + return 0; +fail: + close(s->group); +fail_container: + close(s->container); + return ret; +} + +static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, + void *host, size_t size) +{ + QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); + trace_qemu_vfio_ram_block_added(s, host, size); + qemu_vfio_dma_map(s, host, size, false, NULL); +} + +static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, + void *host, size_t size) +{ + QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); + if (host) { + trace_qemu_vfio_ram_block_removed(s, host, size); + qemu_vfio_dma_unmap(s, host); + } +} + +static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr, + ram_addr_t offset, ram_addr_t length, + void *opaque) +{ + int ret; + QEMUVFIOState *s = opaque; + + if (!host_addr) { + return 0; + } + ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL); + if (ret) { + fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n", + host_addr, (uint64_t)length); + } + return 0; +} + +static void qemu_vfio_open_common(QEMUVFIOState *s) +{ + s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added; + s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed; + ram_block_notifier_add(&s->ram_notifier); + s->low_water_mark = QEMU_VFIO_IOVA_MIN; + s->high_water_mark = QEMU_VFIO_IOVA_MAX; + qemu_ram_foreach_block(qemu_vfio_init_ramblock, s); + qemu_mutex_init(&s->lock); +} + +/** + * Open a PCI device, e.g. "0000:00:01.0". + */ +QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp) +{ + int r; + QEMUVFIOState *s = g_new0(QEMUVFIOState, 1); + + r = qemu_vfio_init_pci(s, device, errp); + if (r) { + g_free(s); + return NULL; + } + qemu_vfio_open_common(s); + return s; +} + +static void qemu_vfio_dump_mapping(IOVAMapping *m) +{ + if (QEMU_VFIO_DEBUG) { + printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host, + (uint64_t)m->size, (uint64_t)m->iova); + } +} + +static void qemu_vfio_dump_mappings(QEMUVFIOState *s) +{ + int i; + + if (QEMU_VFIO_DEBUG) { + printf("vfio mappings\n"); + for (i = 0; i < s->nr_mappings; ++i) { + qemu_vfio_dump_mapping(&s->mappings[i]); + } + } +} + +/** + * Find the mapping entry that contains [host, host + size) and set @index to + * the position. If no entry contains it, @index is the position _after_ which + * to insert the new mapping. IOW, it is the index of the largest element that + * is smaller than @host, or -1 if no entry is. + */ +static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host, + int *index) +{ + IOVAMapping *p = s->mappings; + IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL; + IOVAMapping *mid; + trace_qemu_vfio_find_mapping(s, host); + if (!p) { + *index = -1; + return NULL; + } + while (true) { + mid = p + (q - p) / 2; + if (mid == p) { + break; + } + if (mid->host > host) { + q = mid; + } else if (mid->host < host) { + p = mid; + } else { + break; + } + } + if (mid->host > host) { + mid--; + } else if (mid < &s->mappings[s->nr_mappings - 1] + && (mid + 1)->host <= host) { + mid++; + } + *index = mid - &s->mappings[0]; + if (mid >= &s->mappings[0] && + mid->host <= host && mid->host + mid->size > host) { + assert(mid < &s->mappings[s->nr_mappings]); + return mid; + } + /* At this point *index + 1 is the right position to insert the new + * mapping.*/ + return NULL; +} + +/** + * Allocate IOVA and and create a new mapping record and insert it in @s. + */ +static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s, + void *host, size_t size, + int index, uint64_t iova) +{ + int shift; + IOVAMapping m = {.host = host, .size = size, .iova = iova}; + IOVAMapping *insert; + + assert(QEMU_IS_ALIGNED(size, getpagesize())); + assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize())); + assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize())); + trace_qemu_vfio_new_mapping(s, host, size, index, iova); + + assert(index >= 0); + s->nr_mappings++; + s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]), + s->nr_mappings); + insert = &s->mappings[index]; + shift = s->nr_mappings - index - 1; + if (shift) { + memmove(insert + 1, insert, shift * sizeof(s->mappings[0])); + } + *insert = m; + return insert; +} + +/* Do the DMA mapping with VFIO. */ +static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size, + uint64_t iova) +{ + struct vfio_iommu_type1_dma_map dma_map = { + .argsz = sizeof(dma_map), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .iova = iova, + .vaddr = (uintptr_t)host, + .size = size, + }; + trace_qemu_vfio_do_mapping(s, host, size, iova); + + if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) { + error_report("VFIO_MAP_DMA: %d", -errno); + return -errno; + } + return 0; +} + +/** + * Undo the DMA mapping from @s with VFIO, and remove from mapping list. + */ +static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping, + Error **errp) +{ + int index; + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, + .iova = mapping->iova, + .size = mapping->size, + }; + + index = mapping - s->mappings; + assert(mapping->size > 0); + assert(QEMU_IS_ALIGNED(mapping->size, getpagesize())); + assert(index >= 0 && index < s->nr_mappings); + if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno); + } + memmove(mapping, &s->mappings[index + 1], + sizeof(s->mappings[0]) * (s->nr_mappings - index - 1)); + s->nr_mappings--; + s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]), + s->nr_mappings); +} + +/* Check if the mapping list is (ascending) ordered. */ +static bool qemu_vfio_verify_mappings(QEMUVFIOState *s) +{ + int i; + if (QEMU_VFIO_DEBUG) { + for (i = 0; i < s->nr_mappings - 1; ++i) { + if (!(s->mappings[i].host < s->mappings[i + 1].host)) { + fprintf(stderr, "item %d not sorted!\n", i); + qemu_vfio_dump_mappings(s); + return false; + } + if (!(s->mappings[i].host + s->mappings[i].size <= + s->mappings[i + 1].host)) { + fprintf(stderr, "item %d overlap with next!\n", i); + qemu_vfio_dump_mappings(s); + return false; + } + } + } + return true; +} + +/* Map [host, host + size) area into a contiguous IOVA address space, and store + * the result in @iova if not NULL. The caller need to make sure the area is + * aligned to page size, and mustn't overlap with existing mapping areas (split + * mapping status within this area is not allowed). + */ +int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, + bool temporary, uint64_t *iova) +{ + int ret = 0; + int index; + IOVAMapping *mapping; + uint64_t iova0; + + assert(QEMU_PTR_IS_ALIGNED(host, getpagesize())); + assert(QEMU_IS_ALIGNED(size, getpagesize())); + trace_qemu_vfio_dma_map(s, host, size, temporary, iova); + qemu_mutex_lock(&s->lock); + mapping = qemu_vfio_find_mapping(s, host, &index); + if (mapping) { + iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host); + } else { + if (s->high_water_mark - s->low_water_mark + 1 < size) { + ret = -ENOMEM; + goto out; + } + if (!temporary) { + iova0 = s->low_water_mark; + mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0); + if (!mapping) { + ret = -ENOMEM; + goto out; + } + assert(qemu_vfio_verify_mappings(s)); + ret = qemu_vfio_do_mapping(s, host, size, iova0); + if (ret) { + qemu_vfio_undo_mapping(s, mapping, NULL); + goto out; + } + s->low_water_mark += size; + qemu_vfio_dump_mappings(s); + } else { + iova0 = s->high_water_mark - size; + ret = qemu_vfio_do_mapping(s, host, size, iova0); + if (ret) { + goto out; + } + s->high_water_mark -= size; + } + } + if (iova) { + *iova = iova0; + } +out: + qemu_mutex_unlock(&s->lock); + return ret; +} + +/* Reset the high watermark and free all "temporary" mappings. */ +int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, + .iova = s->high_water_mark, + .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark, + }; + trace_qemu_vfio_dma_reset_temporary(s); + qemu_mutex_lock(&s->lock); + if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_report("VFIO_UNMAP_DMA: %d", -errno); + qemu_mutex_unlock(&s->lock); + return -errno; + } + s->high_water_mark = QEMU_VFIO_IOVA_MAX; + qemu_mutex_unlock(&s->lock); + return 0; +} + +/* Unmapping the whole area that was previously mapped with + * qemu_vfio_dma_map(). */ +void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host) +{ + int index = 0; + IOVAMapping *m; + + if (!host) { + return; + } + + trace_qemu_vfio_dma_unmap(s, host); + qemu_mutex_lock(&s->lock); + m = qemu_vfio_find_mapping(s, host, &index); + if (!m) { + goto out; + } + qemu_vfio_undo_mapping(s, m, NULL); +out: + qemu_mutex_unlock(&s->lock); +} + +static void qemu_vfio_reset(QEMUVFIOState *s) +{ + ioctl(s->device, VFIO_DEVICE_RESET); +} + +/* Close and free the VFIO resources. */ +void qemu_vfio_close(QEMUVFIOState *s) +{ + int i; + + if (!s) { + return; + } + for (i = 0; i < s->nr_mappings; ++i) { + qemu_vfio_undo_mapping(s, &s->mappings[i], NULL); + } + ram_block_notifier_remove(&s->ram_notifier); + qemu_vfio_reset(s); + close(s->device); + close(s->group); + close(s->container); +} -- cgit v1.1