/*
 * Copyright (c) 2019 Nutanix Inc. All rights reserved.
 *
 * Authors: Mike Cui
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Nutanix nor the names of its contributors may be
 *       used to endorse or promote products derived from this software without
 *       specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

#ifndef LIB_VFIO_USER_DMA_H
#define LIB_VFIO_USER_DMA_H

/*
 * FIXME check whether DMA regions must be page aligned. If so then the
 * implementation can be greatly simplified.
 */

/*
 * This library emulates a DMA controller for a device emulation application to
 * perform DMA operations on a foreign memory space.
 *
 * Concepts:
 * - A DMA controller has its own 64-bit DMA address space.
 * - Foreign memory is made available to the DMA controller in linear chunks
 *   called memory regions.
 * - Each memory region is backed by a file descriptor and is registered with
 *   the DMA controller at a unique, non-overlapping linear span of the DMA
 *   address space.
 * - To perform DMA, the application should first build a scatter-gather list
 *   (sgl) of dma_sg_t from DMA addresses. Then the sgl can be mapped using
 *   dma_sgl_get() into the process's virtual address space as an iovec for
 *   direct access, and unmapped using dma_sgl_put() when done.
 *   Every region is mapped into the application's virtual address space at
 *   registration time with R/W permissions. dma_sgl_get() ignores all
 *   protection bits and only does lookups and returns pointers to the
 *   previously mapped regions. dma_sgl_put() is effectively a no-op.
 */
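
/*
 * Illustrative usage sketch (not part of the API; the buffer sizes and error
 * handling below are arbitrary assumptions): translate a DMA address span
 * into an sgl, map it to iovecs, access the memory directly, then release it.
 *
 *     dma_sg_t sg[8];
 *     struct iovec iov[8];
 *     int nr_sgs = dma_addr_to_sgl(dma, dma_addr, len, sg, 8, PROT_WRITE);
 *     if (nr_sgs > 0 && dma_sgl_get(dma, sg, iov, nr_sgs) == 0) {
 *         for (int i = 0; i < nr_sgs; i++) {
 *             memset(iov[i].iov_base, 0, iov[i].iov_len); // direct access
 *         }
 *         dma_sgl_put(dma, sg, nr_sgs); // also records dirty pages if enabled
 *     }
 */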
#include <stdio.h>

#ifdef DMA_MAP_PROTECTED
#undef DMA_MAP_FAST
#define DMA_MAP_FAST_IMPL 0
#else
#define DMA_MAP_FAST_IMPL 1
#endif

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/uio.h>

#include "libvfio-user.h"
#include "common.h"
#include "private.h"

#define iov_end(iov) ((iov)->iov_base + (iov)->iov_len)

struct vfu_ctx;

struct dma_sg {
    vfu_dma_addr_t dma_addr;
    int region;
    uint64_t length;
    uint64_t offset;
    bool writeable;
};

typedef struct {
    vfu_dma_info_t info;
    int fd;                     // File descriptor to mmap
    off_t offset;               // File offset
    uint8_t *dirty_bitmap;      // Dirty page bitmap
} dma_memory_region_t;

typedef struct dma_controller {
    int max_regions;
    size_t max_size;
    int nregions;
    struct vfu_ctx *vfu_ctx;
    size_t dirty_pgsize;        // Dirty page granularity
    dma_memory_region_t regions[0];
} dma_controller_t;

dma_controller_t *
dma_controller_create(vfu_ctx_t *vfu_ctx, size_t max_regions, size_t max_size);

void
dma_controller_remove_all_regions(dma_controller_t *dma,
                                  vfu_dma_unregister_cb_t *dma_unregister,
                                  void *data);

void
dma_controller_destroy(dma_controller_t *dma);

/* Registers a new memory region.
 * Returns:
 * - On success, a non-negative region number
 * - On failure, -1 with errno set.
 */
MOCK_DECLARE(int, dma_controller_add_region, dma_controller_t *dma,
             vfu_dma_addr_t dma_addr, uint64_t size, int fd, off_t offset,
             uint32_t prot);

MOCK_DECLARE(int, dma_controller_remove_region, dma_controller_t *dma,
             vfu_dma_addr_t dma_addr, size_t size,
             vfu_dma_unregister_cb_t *dma_unregister, void *data);

MOCK_DECLARE(void, dma_controller_unmap_region, dma_controller_t *dma,
             dma_memory_region_t *region);

// Helper for dma_addr_to_sgl() slow path.
int
_dma_addr_sg_split(const dma_controller_t *dma,
                   vfu_dma_addr_t dma_addr, uint64_t len,
                   dma_sg_t *sg, int max_nr_sgs, int prot);

/* Convert a start address and length to its containing page numbers. */
static inline void
range_to_pages(size_t start, size_t len, size_t pgsize,
               size_t *pgstart, size_t *pgend)
{
    *pgstart = start / pgsize;
    *pgend = ROUND_UP(start + len, pgsize) / pgsize;
}

/* Given a bit position, return the containing byte. */
static inline size_t
bit_to_u8(size_t val)
{
    return val / (CHAR_BIT);
}

/* Return a value modulo the bitsize of a uint8_t. */
static inline size_t
bit_to_u8off(size_t val)
{
    return val % (CHAR_BIT);
}
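
/*
 * Worked example (illustrative only; the numbers are arbitrary assumptions):
 * with a 4 KiB dirty-page granularity, an access at region offset 0x1800 of
 * length 0x2000 ends at 0x3800, so range_to_pages() reports the page range
 * [1, 4), i.e. pages 1-3. Each page number then indexes the dirty bitmap one
 * bit per page: bit_to_u8(1) == 0 selects byte 0, and bit_to_u8off(1) == 1
 * selects bit 1 within it.
 *
 *     size_t pgstart, pgend;
 *     range_to_pages(0x1800, 0x2000, 0x1000, &pgstart, &pgend);
 *     assert(pgstart == 1 && pgend == 4);
 */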
static inline void
_dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region,
                dma_sg_t *sg)
{
    size_t index;
    size_t end;
    size_t pgstart;
    size_t pgend;
    size_t i;

    assert(dma != NULL);
    assert(region != NULL);
    assert(sg != NULL);
    assert(region->dirty_bitmap != NULL);

    range_to_pages(sg->offset, sg->length, dma->dirty_pgsize,
                   &pgstart, &pgend);

    index = bit_to_u8(pgstart);
    end = bit_to_u8(pgend) + !!(bit_to_u8off(pgend));

    for (i = index; i < end; i++) {
        uint8_t bm = ~0;

        /* Mask off any pages in the first u8 that aren't in the range. */
        if (i == index && bit_to_u8off(pgstart) != 0) {
            bm &= ~((1 << bit_to_u8off(pgstart)) - 1);
        }

        /* Mask off any pages in the last u8 that aren't in the range. */
        if (i == end - 1 && bit_to_u8off(pgend) != 0) {
            bm &= ((1 << bit_to_u8off(pgend)) - 1);
        }

        __atomic_or_fetch(&region->dirty_bitmap[i], bm, __ATOMIC_SEQ_CST);
    }
}

static inline int
dma_init_sg(const dma_controller_t *dma, dma_sg_t *sg, vfu_dma_addr_t dma_addr,
            uint64_t len, int prot, int region_index)
{
    const dma_memory_region_t *const region = &dma->regions[region_index];

    if ((prot & PROT_WRITE) && !(region->info.prot & PROT_WRITE)) {
        vfu_log(dma->vfu_ctx, LOG_DEBUG, "read-only region");
        return ERROR_INT(EACCES);
    }

    sg->dma_addr = region->info.iova.iov_base;
    sg->region = region_index;
    sg->offset = dma_addr - region->info.iova.iov_base;
    sg->length = len;
    sg->writeable = prot & PROT_WRITE;

    return 0;
}

/* Takes a linear dma address span and returns a sg list suitable for DMA.
 * A single linear dma address span may need to be split into multiple
 * scatter gather regions due to limitations of how memory can be mapped.
 *
 * Returns:
 * - On success, number of scatter gather entries created.
 * - On failure:
 *   -1 if
 *     - the DMA address span is invalid
 *     - protection violation (errno=EACCES)
 *   (-x - 1) if @max_nr_sgs is too small, where x is the number of sg entries
 *   necessary to complete this request.
 */
static inline int
dma_addr_to_sgl(const dma_controller_t *dma,
                vfu_dma_addr_t dma_addr, size_t len,
                dma_sg_t *sgl, size_t max_nr_sgs, int prot)
{
    static __thread int region_hint;
    int cnt, ret;

    const dma_memory_region_t *const region = &dma->regions[region_hint];
    const void *region_end = iov_end(&region->info.iova);

    // Fast path: single region.
    if (likely(max_nr_sgs > 0 && len > 0 &&
               dma_addr >= region->info.iova.iov_base &&
               dma_addr + len <= region_end &&
               region_hint < dma->nregions)) {
        ret = dma_init_sg(dma, sgl, dma_addr, len, prot, region_hint);
        if (ret < 0) {
            return ret;
        }

        return 1;
    }
    // Slow path: search through regions.
    cnt = _dma_addr_sg_split(dma, dma_addr, len, sgl, max_nr_sgs, prot);
    if (likely(cnt > 0)) {
        region_hint = sgl[0].region;
    }

    return cnt;
}
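
/*
 * Illustrative sketch (assumed caller-side logic, not part of this header) of
 * handling the (-x - 1) convention documented above, where x is the number of
 * sg entries the span actually needs:
 *
 *     int ret = dma_addr_to_sgl(dma, dma_addr, len, sgl, nr_sgs, PROT_READ);
 *     if (ret == -1) {
 *         // invalid span or protection violation, errno is set
 *     } else if (ret < -1) {
 *         int needed = -(ret + 1); // retry with at least this many entries
 *     }
 */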
static inline int
dma_sgl_get(dma_controller_t *dma, dma_sg_t *sgl, struct iovec *iov, size_t cnt)
{
    dma_memory_region_t *region;
    dma_sg_t *sg;

    assert(dma != NULL);
    assert(sgl != NULL);
    assert(iov != NULL);
    assert(cnt > 0);

    sg = sgl;

    do {
        if (sg->region >= dma->nregions) {
            return ERROR_INT(EINVAL);
        }

        region = &dma->regions[sg->region];

        if (region->info.vaddr == NULL) {
            return ERROR_INT(EFAULT);
        }

#ifdef DEBUG_SGL
        vfu_log(dma->vfu_ctx, LOG_DEBUG, "map %p-%p",
                sg->dma_addr + sg->offset,
                sg->dma_addr + sg->offset + sg->length);
#endif

        iov->iov_base = region->info.vaddr + sg->offset;
        iov->iov_len = sg->length;

        sg++;
        iov++;
    } while (--cnt > 0);

    return 0;
}

static inline void
dma_sgl_mark_dirty(dma_controller_t *dma, dma_sg_t *sgl, size_t cnt)
{
    dma_memory_region_t *region;
    dma_sg_t *sg;

    assert(dma != NULL);
    assert(sgl != NULL);
    assert(cnt > 0);

    sg = sgl;

    do {
        if (sg->region >= dma->nregions) {
            return;
        }

        region = &dma->regions[sg->region];

        if (sg->writeable) {
            if (dma->dirty_pgsize > 0) {
                _dma_mark_dirty(dma, region, sg);
            }
        }

#ifdef DEBUG_SGL
        vfu_log(dma->vfu_ctx, LOG_DEBUG, "mark dirty %p-%p",
                sg->dma_addr + sg->offset,
                sg->dma_addr + sg->offset + sg->length);
#endif

        sg++;
    } while (--cnt > 0);
}

static inline void
dma_sgl_put(dma_controller_t *dma, dma_sg_t *sgl, size_t cnt)
{
    dma_memory_region_t *region;
    dma_sg_t *sg;

    assert(dma != NULL);
    assert(sgl != NULL);
    assert(cnt > 0);

    sg = sgl;

    do {
        if (sg->region >= dma->nregions) {
            return;
        }

        region = &dma->regions[sg->region];

        if (sg->writeable) {
            if (dma->dirty_pgsize > 0) {
                _dma_mark_dirty(dma, region, sg);
            }
        }

#ifdef DEBUG_SGL
        vfu_log(dma->vfu_ctx, LOG_DEBUG, "unmap %p-%p",
                sg->dma_addr + sg->offset,
                sg->dma_addr + sg->offset + sg->length);
#endif

        sg++;
    } while (--cnt > 0);
}

int
dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize);

void
dma_controller_dirty_page_logging_stop(dma_controller_t *dma);

int
dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
                              uint64_t len, size_t pgsize, size_t size,
                              char *bitmap);

bool
dma_sg_is_mappable(const dma_controller_t *dma, const dma_sg_t *sg);

#endif /* LIB_VFIO_USER_DMA_H */

/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */