/* * Sample server to be tested with samples/client.c * * Copyright (c) 2020, Nutanix Inc. All rights reserved. * Author: Thanos Makatos * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Nutanix nor the names of its contributors may be * used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common.h" #include "libvfio-user.h" #include "rte_hash_crc.h" struct dma_regions { struct iovec iova; uint32_t prot; }; #define NR_DMA_REGIONS 96 struct server_data { time_t bar0; void *bar1; size_t bar1_size; struct dma_regions regions[NR_DMA_REGIONS]; struct { uint64_t bytes_transferred; vfu_migr_state_t state; } migration; }; static void _log(vfu_ctx_t *vfu_ctx UNUSED, UNUSED int level, char const *msg) { fprintf(stderr, "server[%d]: %s\n", getpid(), msg); } static int arm_timer(vfu_ctx_t *vfu_ctx, time_t t) { struct itimerval new = {.it_value.tv_sec = t - time(NULL) }; vfu_log(vfu_ctx, LOG_DEBUG, "arming timer to trigger in %ld seconds", new.it_value.tv_sec); if (setitimer(ITIMER_REAL, &new, NULL) != 0) { vfu_log(vfu_ctx, LOG_ERR, "failed to arm timer: %m"); return -1; } return 0; } static ssize_t bar0_access(vfu_ctx_t *vfu_ctx, char * const buf, size_t count, loff_t offset, const bool is_write) { struct server_data *server_data = vfu_get_private(vfu_ctx); if (count != sizeof(time_t) || offset != 0) { vfu_log(vfu_ctx, LOG_ERR, "bad BAR0 access %#llx-%#llx", (unsigned long long)offset, (unsigned long long)offset + count - 1); errno = EINVAL; return -1; } if (is_write) { if (server_data->migration.state == VFU_MIGR_STATE_RUNNING) { int ret = arm_timer(vfu_ctx, *(time_t*)buf); if (ret < 0) { return ret; } } memcpy(&server_data->bar0, buf, count); } else { time_t delta = time(NULL) - server_data->bar0; memcpy(buf, &delta, count); } return count; } static ssize_t bar1_access(vfu_ctx_t *vfu_ctx, char * const buf, size_t count, loff_t offset, const bool is_write) { struct server_data *server_data = vfu_get_private(vfu_ctx); if (offset + count > server_data->bar1_size) { vfu_log(vfu_ctx, LOG_ERR, "bad BAR1 access %#llx-%#llx", (unsigned long long)offset, (unsigned long long)offset + count - 1); errno = EINVAL; return -1; } if (is_write) { memcpy(server_data->bar1 + offset, buf, count); } else { memcpy(buf, server_data->bar1, count); } return count; } bool irq_triggered = false; static void _sa_handler(int signum) { int _errno = errno; if (signum == SIGALRM) { irq_triggered = true; } errno = _errno; } static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) { struct server_data *server_data = vfu_get_private(vfu_ctx); int idx; for (idx = 0; idx < NR_DMA_REGIONS; idx++) { if (server_data->regions[idx].iova.iov_base == NULL && server_data->regions[idx].iova.iov_len == 0) break; } if (idx >= NR_DMA_REGIONS) { errx(EXIT_FAILURE, "Failed to add dma region, slots full"); } server_data->regions[idx].iova = info->iova; server_data->regions[idx].prot = info->prot; } static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) { struct server_data *server_data = vfu_get_private(vfu_ctx); int idx; for (idx = 0; idx < NR_DMA_REGIONS; idx++) { if (server_data->regions[idx].iova.iov_len == info->iova.iov_len && server_data->regions[idx].iova.iov_base == info->iova.iov_base) { server_data->regions[idx].iova.iov_base = NULL; server_data->regions[idx].iova.iov_len = 0; } } } /* * FIXME this function does DMA write/read using messages. This should be done * on a region that is not memory mappable or an area of a region that is not * sparsely memory mappable. We should also have a test where the server does * DMA directly on the client memory. */ static void do_dma_io(vfu_ctx_t *vfu_ctx, struct server_data *server_data, int region, bool use_messages) { const int size = 1024; const int count = 4; unsigned char buf[size * count]; uint32_t crc1, crc2; dma_sg_t *sg; void *addr; int ret; sg = alloca(dma_sg_size()); assert(vfu_ctx != NULL); struct iovec iov = {0}; /* Write some data, chunked into multiple calls to exercise offsets. */ for (int i = 0; i < count; ++i) { addr = server_data->regions[region].iova.iov_base + i * size; ret = vfu_addr_to_sgl(vfu_ctx, (vfu_dma_addr_t)addr, size, sg, 1, PROT_WRITE); if (ret < 0) { err(EXIT_FAILURE, "failed to map %p-%p", addr, addr + size - 1); } memset(&buf[i * size], 'A' + i, size); if (use_messages) { vfu_log(vfu_ctx, LOG_DEBUG, "%s: MESSAGE WRITE addr %p size %d", __func__, addr, size); ret = vfu_sgl_write(vfu_ctx, sg, 1, &buf[i * size]); if (ret < 0) { err(EXIT_FAILURE, "vfu_sgl_write failed"); } } else { vfu_log(vfu_ctx, LOG_DEBUG, "%s: DIRECT WRITE addr %p size %d", __func__, addr, size); ret = vfu_sgl_get(vfu_ctx, sg, &iov, 1, 0); if (ret < 0) { err(EXIT_FAILURE, "vfu_sgl_get failed"); } assert(iov.iov_len == (size_t)size); memcpy(iov.iov_base, &buf[i * size], size); /* * When directly writing to client memory the server is responsible * for tracking dirty pages. We assert that all dirty writes are * within the first page of region 1. In fact, all regions are only * one page in size. * * Note: this is not strictly necessary in this example, since we * later call `vfu_sgl_put`, which marks pages dirty if the SGL was * acquired with `PROT_WRITE`. However, `vfu_sgl_mark_dirty` is * useful in cases where the server needs to mark guest memory dirty * without releasing the memory with `vfu_sgl_put`. */ vfu_sgl_mark_dirty(vfu_ctx, sg, 1); assert(region == 1); assert(i * size < (int)PAGE_SIZE); vfu_sgl_put(vfu_ctx, sg, &iov, 1); } } crc1 = rte_hash_crc(buf, sizeof(buf), 0); /* Read the data back at double the chunk size. */ memset(buf, 0, sizeof(buf)); for (int i = 0; i < count; i += 2) { addr = server_data->regions[region].iova.iov_base + i * size; ret = vfu_addr_to_sgl(vfu_ctx, (vfu_dma_addr_t)addr, size * 2, sg, 1, PROT_READ); if (ret < 0) { err(EXIT_FAILURE, "failed to map %p-%p", addr, addr + 2 * size - 1); } if (use_messages) { vfu_log(vfu_ctx, LOG_DEBUG, "%s: MESSAGE READ addr %p size %d", __func__, addr, 2 * size); ret = vfu_sgl_read(vfu_ctx, sg, 1, &buf[i * size]); if (ret < 0) { err(EXIT_FAILURE, "vfu_sgl_read failed"); } } else { vfu_log(vfu_ctx, LOG_DEBUG, "%s: DIRECT READ addr %p size %d", __func__, addr, 2 * size); ret = vfu_sgl_get(vfu_ctx, sg, &iov, 1, 0); if (ret < 0) { err(EXIT_FAILURE, "vfu_sgl_get failed"); } assert(iov.iov_len == 2 * (size_t)size); memcpy(&buf[i * size], iov.iov_base, 2 * size); vfu_sgl_put(vfu_ctx, sg, &iov, 1); } } crc2 = rte_hash_crc(buf, sizeof(buf), 0); if (crc1 != crc2) { errx(EXIT_FAILURE, "DMA write and DMA read mismatch"); } else { vfu_log(vfu_ctx, LOG_DEBUG, "%s: %s success", __func__, use_messages ? "MESSAGE" : "DIRECT"); } } static int device_reset(vfu_ctx_t *vfu_ctx UNUSED, vfu_reset_type_t type UNUSED) { vfu_log(vfu_ctx, LOG_DEBUG, "device reset callback"); return 0; } static int migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) { struct server_data *server_data = vfu_get_private(vfu_ctx); int ret; struct itimerval new = { { 0 }, }; vfu_log(vfu_ctx, LOG_DEBUG, "migration: transition to device state %d", state); switch (state) { case VFU_MIGR_STATE_STOP_AND_COPY: vfu_log(vfu_ctx, LOG_DEBUG, "disable timer"); if (setitimer(ITIMER_REAL, &new, NULL) != 0) { err(EXIT_FAILURE, "failed to disable timer"); } server_data->migration.bytes_transferred = 0; break; case VFU_MIGR_STATE_PRE_COPY: server_data->migration.bytes_transferred = 0; break; case VFU_MIGR_STATE_STOP: /* FIXME should gracefully fail */ if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) { assert(server_data->migration.bytes_transferred == server_data->bar1_size + sizeof(time_t)); } break; case VFU_MIGR_STATE_RESUME: server_data->migration.bytes_transferred = 0; break; case VFU_MIGR_STATE_RUNNING: assert(server_data->migration.bytes_transferred == server_data->bar1_size + sizeof(time_t)); ret = arm_timer(vfu_ctx, server_data->bar0); if (ret < 0) { return ret; } break; default: assert(false); /* FIXME */ } server_data->migration.state = state; return 0; } static ssize_t migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t size) { struct server_data *server_data = vfu_get_private(vfu_ctx); /* * If in pre-copy state we copy BAR1, if in stop-and-copy state we copy * both BAR1 and BAR0. Since we always copy BAR1 in the stop-and-copy state, * copying BAR1 in the pre-copy state is pointless. Fixing this requires * more complex state tracking which exceeds the scope of this sample. */ uint32_t total_to_read = server_data->bar1_size; if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) { total_to_read += sizeof(server_data->bar0); } if (server_data->migration.bytes_transferred == total_to_read || size == 0) { vfu_log(vfu_ctx, LOG_DEBUG, "no data left to read"); return 0; } uint32_t read_start = server_data->migration.bytes_transferred; uint32_t read_end = MIN(read_start + size, total_to_read); assert(read_end > read_start); uint32_t bytes_read = read_end - read_start; uint32_t length_in_bar1 = 0; uint32_t length_in_bar0 = 0; /* read bar1, if any */ if (read_start < server_data->bar1_size) { length_in_bar1 = MIN(bytes_read, server_data->bar1_size - read_start); memcpy(buf, server_data->bar1 + read_start, length_in_bar1); read_start += length_in_bar1; } /* read bar0, if any */ if (read_end > server_data->bar1_size) { length_in_bar0 = read_end - read_start; read_start -= server_data->bar1_size; memcpy(buf + length_in_bar1, (char *)&server_data->bar0 + read_start, length_in_bar0); } server_data->migration.bytes_transferred += bytes_read; return bytes_read; } static ssize_t migration_write_data(vfu_ctx_t *vfu_ctx, void *data, uint64_t size) { struct server_data *server_data = vfu_get_private(vfu_ctx); char *buf = data; assert(server_data != NULL); assert(data != NULL); uint32_t total_to_write = server_data->bar1_size + sizeof(server_data->bar0); if (server_data->migration.bytes_transferred == total_to_write || size == 0) { return 0; } uint32_t write_start = server_data->migration.bytes_transferred; uint32_t write_end = MIN(write_start + size, total_to_write); // exclusive assert(write_end > write_start); uint32_t bytes_written = write_end - write_start; uint32_t length_in_bar1 = 0; uint32_t length_in_bar0 = 0; /* write to bar1, if any */ if (write_start < server_data->bar1_size) { length_in_bar1 = MIN(bytes_written, server_data->bar1_size - write_start); memcpy(server_data->bar1 + write_start, buf, length_in_bar1); write_start += length_in_bar1; } /* write to bar0, if any */ if (write_end > server_data->bar1_size) { length_in_bar0 = write_end - write_start; write_start -= server_data->bar1_size; memcpy((char *)&server_data->bar0 + write_start, buf + length_in_bar1, length_in_bar0); } server_data->migration.bytes_transferred += bytes_written; return bytes_written; } int main(int argc, char *argv[]) { char template[] = "/tmp/libvfio-user.XXXXXX"; int ret; bool verbose = false; int opt; struct sigaction act = {.sa_handler = _sa_handler}; const size_t bar1_size = 0x3000; struct server_data server_data = { .migration = { .state = VFU_MIGR_STATE_RUNNING } }; vfu_ctx_t *vfu_ctx; vfu_trans_t trans = VFU_TRANS_SOCK; int tmpfd; const vfu_migration_callbacks_t migr_callbacks = { .version = VFU_MIGR_CALLBACKS_VERS, .transition = &migration_device_state_transition, .read_data = &migration_read_data, .write_data = &migration_write_data }; while ((opt = getopt(argc, argv, "v")) != -1) { switch (opt) { case 'v': verbose = true; break; default: /* '?' */ errx(EXIT_FAILURE, "Usage: %s [-v] ", argv[0]); } } if (optind >= argc) { errx(EXIT_FAILURE, "missing vfio-user socket path"); } sigemptyset(&act.sa_mask); if (sigaction(SIGALRM, &act, NULL) == -1) { err(EXIT_FAILURE, "failed to register signal handler"); } if (strcmp(argv[optind], "pipe") == 0) { trans = VFU_TRANS_PIPE; } vfu_ctx = vfu_create_ctx(trans, argv[optind], 0, &server_data, VFU_DEV_TYPE_PCI); if (vfu_ctx == NULL) { err(EXIT_FAILURE, "failed to initialize device emulation"); } ret = vfu_setup_log(vfu_ctx, _log, verbose ? LOG_DEBUG : LOG_ERR); if (ret < 0) { err(EXIT_FAILURE, "failed to setup log"); } ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_CONVENTIONAL, PCI_HEADER_TYPE_NORMAL, 0); if (ret < 0) { err(EXIT_FAILURE, "vfu_pci_init() failed") ; } vfu_pci_set_id(vfu_ctx, 0xdead, 0xbeef, 0xcafe, 0xbabe); ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, sizeof(time_t), &bar0_access, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); if (ret < 0) { err(EXIT_FAILURE, "failed to setup BAR0 region"); } umask(0022); /* * Setup BAR1 to be 3 pages in size where only the first and the last pages * are mappable. The client can still mmap the 2nd page, we can't prohibit * this under Linux. If we really want to prohibit it we have to use * separate files for the same region. */ if ((tmpfd = mkstemp(template)) == -1) { err(EXIT_FAILURE, "failed to create backing file"); } unlink(template); server_data.bar1_size = bar1_size; if (ftruncate(tmpfd, server_data.bar1_size) == -1) { err(EXIT_FAILURE, "failed to truncate backing file"); } server_data.bar1 = mmap(NULL, server_data.bar1_size, PROT_READ | PROT_WRITE, MAP_SHARED, tmpfd, 0); if (server_data.bar1 == MAP_FAILED) { err(EXIT_FAILURE, "failed to mmap BAR1"); } struct iovec bar1_mmap_areas[] = { { .iov_base = (void*)0, .iov_len = 0x1000 }, { .iov_base = (void*)0x2000, .iov_len = 0x1000 } }; ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR1_REGION_IDX, server_data.bar1_size, &bar1_access, VFU_REGION_FLAG_RW, bar1_mmap_areas, 2, tmpfd, 0); if (ret < 0) { err(EXIT_FAILURE, "failed to setup BAR1 region"); } ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks); if (ret < 0) { err(EXIT_FAILURE, "failed to setup device migration"); } ret = vfu_setup_device_reset_cb(vfu_ctx, &device_reset); if (ret < 0) { err(EXIT_FAILURE, "failed to setup device reset callbacks"); } ret = vfu_setup_device_dma(vfu_ctx, &dma_register, &dma_unregister); if (ret < 0) { err(EXIT_FAILURE, "failed to setup device DMA callbacks"); } ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); if (ret < 0) { err(EXIT_FAILURE, "failed to setup irq counts"); } ret = vfu_realize_ctx(vfu_ctx); if (ret < 0) { err(EXIT_FAILURE, "failed to realize device"); } ret = vfu_attach_ctx(vfu_ctx); if (ret < 0) { err(EXIT_FAILURE, "failed to attach device"); } do { ret = vfu_run_ctx(vfu_ctx); if (ret == -1 && errno == EINTR) { if (irq_triggered) { irq_triggered = false; ret = vfu_irq_trigger(vfu_ctx, 0); if (ret < 0) { err(EXIT_FAILURE, "vfu_irq_trigger() failed"); } printf("doing dma io\n"); /* * We initiate some dummy DMA by directly accessing the client's * memory. In this case, we keep track of dirty pages ourselves, * as the client has no knowledge of what and when we have * written to its memory. */ do_dma_io(vfu_ctx, &server_data, 1, false); /* * We also do some dummy DMA via explicit messages to show how * DMA is done if the client's RAM isn't mappable or the server * implementation prefers it this way. In this case, the client * is responsible for tracking pages that are dirtied, as it is * the one actually performing the writes. */ do_dma_io(vfu_ctx, &server_data, 0, true); ret = 0; } } } while (ret == 0); if (ret == -1 && errno != ENOTCONN && errno != EINTR && errno != ESHUTDOWN) { errx(EXIT_FAILURE, "failed to realize device emulation"); } vfu_destroy_ctx(vfu_ctx); return EXIT_SUCCESS; } /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */