40 files changed, 2387 insertions, 584 deletions
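The build-system hunks further below add a ``rutabaga_gfx`` feature option probed via pkg-config (``rutabaga_gfx_ffi``). Assuming QEMU's usual mapping of meson options to configure flags, enabling the new backend at build time would look like this (the flag name is inferred from the meson option, so treat it as an assumption):

    ./configure --enable-rutabaga-gfx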
diff --git a/docs/system/device-emulation.rst b/docs/system/device-emulation.rst index 4491c4c..1167f3a 100644 --- a/docs/system/device-emulation.rst +++ b/docs/system/device-emulation.rst @@ -91,6 +91,7 @@ Emulated Devices devices/nvme.rst devices/usb.rst devices/vhost-user.rst + devices/virtio-gpu.rst devices/virtio-pmem.rst devices/vhost-user-rng.rst devices/canokey.rst diff --git a/docs/system/devices/virtio-gpu.rst b/docs/system/devices/virtio-gpu.rst new file mode 100644 index 0000000..cb73dd7 --- /dev/null +++ b/docs/system/devices/virtio-gpu.rst @@ -0,0 +1,112 @@ +.. + SPDX-License-Identifier: GPL-2.0-or-later + +virtio-gpu +========== + +This document explains the setup and usage of the virtio-gpu device. +The virtio-gpu device paravirtualizes the GPU and display controller. + +Linux kernel support +-------------------- + +virtio-gpu requires a guest Linux kernel built with the +``CONFIG_DRM_VIRTIO_GPU`` option. + +QEMU virtio-gpu variants +------------------------ + +QEMU virtio-gpu device variants come in the following form: + + * ``virtio-vga[-BACKEND]`` + * ``virtio-gpu[-BACKEND][-INTERFACE]`` + * ``vhost-user-vga`` + * ``vhost-user-pci`` + +**Backends:** QEMU provides a 2D virtio-gpu backend, and two accelerated +backends: virglrenderer ('gl' device label) and rutabaga_gfx ('rutabaga' +device label). There is a vhost-user backend that runs the graphics stack +in a separate process for improved isolation. + +**Interfaces:** QEMU further categorizes virtio-gpu device variants based +on the interface exposed to the guest. The interfaces can be classified +into VGA and non-VGA variants. The VGA ones are prefixed with virtio-vga +or vhost-user-vga, while the non-VGA ones are prefixed with virtio-gpu or +vhost-user-gpu. + +The VGA ones always use the PCI interface, but for the non-VGA ones, the +user can further pick between MMIO or PCI. For MMIO, the user can suffix +the device name with -device, though vhost-user-gpu does not support MMIO. +For PCI, the user can suffix it with -pci. Without these suffixes, the +platform default will be chosen. + +virtio-gpu 2d +------------- + +The default 2D backend only performs 2D operations. The guest needs to +employ a software renderer for 3D graphics. + +Typically, the software renderer is provided by `Mesa`_ or `SwiftShader`_. +Mesa's implementations (LLVMpipe, Lavapipe and virgl below) work out of the +box on typical modern Linux distributions. + +.. parsed-literal:: + -device virtio-gpu + +.. _Mesa: https://www.mesa3d.org/ +.. _SwiftShader: https://github.com/google/swiftshader + +virtio-gpu virglrenderer +------------------------ + +When using virgl accelerated graphics mode in the guest, OpenGL API calls +are translated into an intermediate representation (see `Gallium3D`_). The +intermediate representation is communicated to the host and the +`virglrenderer`_ library on the host translates the intermediate +representation back to OpenGL API calls. + +.. parsed-literal:: + -device virtio-gpu-gl + +.. _Gallium3D: https://www.freedesktop.org/wiki/Software/gallium/ +.. _virglrenderer: https://gitlab.freedesktop.org/virgl/virglrenderer/ + +virtio-gpu rutabaga +------------------- + +virtio-gpu can also leverage rutabaga_gfx to provide `gfxstream`_ +rendering and `Wayland display passthrough`_. With the gfxstream rendering +mode, GLES and Vulkan calls are forwarded to the host with minimal +modification. + +The crosvm book provides directions on how to build a `gfxstream-enabled +rutabaga`_ and launch a `guest Wayland proxy`_.
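A minimal Wayland-passthrough configuration is sketched here for orientation; the ``cross-domain`` capset and ``hostmem`` window it uses are explained below, and the values shown are illustrative rather than required:

.. parsed-literal::
    -device virtio-gpu-rutabaga,cross-domain=on,hostmem=2G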
+ +This device does require host blob support (``hostmem`` field below). The +``hostmem`` field specifies the size of the virtio-gpu host memory window. +This is typically between 256M and 8G. + +At least one virtio-gpu capability set ("capset") must be specified when +starting the device. The capsets currently supported are ``gfxstream-vulkan`` +and ``cross-domain`` for Linux guests. For Android guests, the experimental +``x-gfxstream-gles`` and ``x-gfxstream-composer`` capsets are also supported. + +The device will try to auto-detect the Wayland socket path if the +``cross-domain`` capset name is set. The user may optionally specify +``wayland-socket-path`` for non-standard paths. + +The ``wsi`` option can be set to ``surfaceless`` or ``headless``. +Surfaceless doesn't create a native window surface, but does copy from the +render target to the Pixman buffer if a virtio-gpu 2D hypercall is issued. +Headless is like surfaceless, but doesn't copy to the Pixman buffer. +Surfaceless is the default if ``wsi`` is not specified. + +.. parsed-literal:: + -device virtio-gpu-rutabaga,gfxstream-vulkan=on,cross-domain=on, + hostmem=8G,wayland-socket-path=/tmp/nonstandard/mock_wayland.sock, + wsi=headless + +.. _gfxstream: https://android.googlesource.com/platform/hardware/google/gfxstream/ +.. _Wayland display passthrough: https://www.youtube.com/watch?v=OZJiHMtIQ2M +.. _gfxstream-enabled rutabaga: https://crosvm.dev/book/appendix/rutabaga_gfx.html +.. _guest Wayland proxy: https://crosvm.dev/book/devices/wayland.html diff --git a/hw/display/meson.build b/hw/display/meson.build index 05619c6..2b64fd9 100644 --- a/hw/display/meson.build +++ b/hw/display/meson.build @@ -80,6 +80,13 @@ if config_all_devices.has_key('CONFIG_VIRTIO_GPU') if_true: [files('virtio-gpu-gl.c', 'virtio-gpu-virgl.c'), pixman, virgl]) hw_display_modules += {'virtio-gpu-gl': virtio_gpu_gl_ss} endif + + if rutabaga.found() + virtio_gpu_rutabaga_ss = ss.source_set() + virtio_gpu_rutabaga_ss.add(when: ['CONFIG_VIRTIO_GPU', rutabaga], + if_true: [files('virtio-gpu-rutabaga.c'), pixman]) + hw_display_modules += {'virtio-gpu-rutabaga': virtio_gpu_rutabaga_ss} + endif endif if config_all_devices.has_key('CONFIG_VIRTIO_PCI') @@ -96,6 +103,12 @@ if config_all_devices.has_key('CONFIG_VIRTIO_PCI') if_true: [files('virtio-gpu-pci-gl.c'), pixman]) hw_display_modules += {'virtio-gpu-pci-gl': virtio_gpu_pci_gl_ss} endif + if rutabaga.found() + virtio_gpu_pci_rutabaga_ss = ss.source_set() + virtio_gpu_pci_rutabaga_ss.add(when: ['CONFIG_VIRTIO_GPU', 'CONFIG_VIRTIO_PCI', rutabaga], + if_true: [files('virtio-gpu-pci-rutabaga.c'), pixman]) + hw_display_modules += {'virtio-gpu-pci-rutabaga': virtio_gpu_pci_rutabaga_ss} + endif endif if config_all_devices.has_key('CONFIG_VIRTIO_VGA') @@ -114,6 +127,15 @@ if config_all_devices.has_key('CONFIG_VIRTIO_VGA') virtio_vga_gl_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-vga.c'), if_false: files('acpi-vga-stub.c')) hw_display_modules += {'virtio-vga-gl': virtio_vga_gl_ss} + + if rutabaga.found() + virtio_vga_rutabaga_ss = ss.source_set() + virtio_vga_rutabaga_ss.add(when: ['CONFIG_VIRTIO_VGA', rutabaga], + if_true: [files('virtio-vga-rutabaga.c'), pixman]) + virtio_vga_rutabaga_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-vga.c'), + if_false: files('acpi-vga-stub.c')) + hw_display_modules += {'virtio-vga-rutabaga': virtio_vga_rutabaga_ss} + endif endif system_ss.add(when: 'CONFIG_OMAP', if_true: files('omap_lcdc.c')) diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c index
ca1fb7b..50c5373 100644 --- a/hw/display/virtio-gpu-base.c +++ b/hw/display/virtio-gpu-base.c @@ -223,7 +223,8 @@ virtio_gpu_base_get_features(VirtIODevice *vdev, uint64_t features, { VirtIOGPUBase *g = VIRTIO_GPU_BASE(vdev); - if (virtio_gpu_virgl_enabled(g->conf)) { + if (virtio_gpu_virgl_enabled(g->conf) || + virtio_gpu_rutabaga_enabled(g->conf)) { features |= (1 << VIRTIO_GPU_F_VIRGL); } if (virtio_gpu_edid_enabled(g->conf)) { @@ -232,6 +233,9 @@ virtio_gpu_base_get_features(VirtIODevice *vdev, uint64_t features, if (virtio_gpu_blob_enabled(g->conf)) { features |= (1 << VIRTIO_GPU_F_RESOURCE_BLOB); } + if (virtio_gpu_context_init_enabled(g->conf)) { + features |= (1 << VIRTIO_GPU_F_CONTEXT_INIT); + } return features; } diff --git a/hw/display/virtio-gpu-pci-rutabaga.c b/hw/display/virtio-gpu-pci-rutabaga.c new file mode 100644 index 0000000..c96729e --- /dev/null +++ b/hw/display/virtio-gpu-pci-rutabaga.c @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "hw/pci/pci.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/virtio-gpu-pci.h" +#include "qom/object.h" + +#define TYPE_VIRTIO_GPU_RUTABAGA_PCI "virtio-gpu-rutabaga-pci" +OBJECT_DECLARE_SIMPLE_TYPE(VirtIOGPURutabagaPCI, VIRTIO_GPU_RUTABAGA_PCI) + +struct VirtIOGPURutabagaPCI { + VirtIOGPUPCIBase parent_obj; + + VirtIOGPURutabaga vdev; +}; + +static void virtio_gpu_rutabaga_initfn(Object *obj) +{ + VirtIOGPURutabagaPCI *dev = VIRTIO_GPU_RUTABAGA_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_GPU_RUTABAGA); + VIRTIO_GPU_PCI_BASE(obj)->vgpu = VIRTIO_GPU_BASE(&dev->vdev); +} + +static const TypeInfo virtio_gpu_rutabaga_pci_info[] = { + { + .name = TYPE_VIRTIO_GPU_RUTABAGA_PCI, + .parent = TYPE_VIRTIO_GPU_PCI_BASE, + .instance_size = sizeof(VirtIOGPURutabagaPCI), + .instance_init = virtio_gpu_rutabaga_initfn, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + } + }, +}; + +DEFINE_TYPES(virtio_gpu_rutabaga_pci_info) + +module_obj(TYPE_VIRTIO_GPU_RUTABAGA_PCI); +module_kconfig(VIRTIO_PCI); +module_dep("hw-display-virtio-gpu-pci"); diff --git a/hw/display/virtio-gpu-pci.c b/hw/display/virtio-gpu-pci.c index 93f214f..da6a99f 100644 --- a/hw/display/virtio-gpu-pci.c +++ b/hw/display/virtio-gpu-pci.c @@ -33,6 +33,20 @@ static void virtio_gpu_pci_base_realize(VirtIOPCIProxy *vpci_dev, Error **errp) DeviceState *vdev = DEVICE(g); int i; + if (virtio_gpu_hostmem_enabled(g->conf)) { + vpci_dev->msix_bar_idx = 1; + vpci_dev->modern_mem_bar_idx = 2; + memory_region_init(&g->hostmem, OBJECT(g), "virtio-gpu-hostmem", + g->conf.hostmem); + pci_register_bar(&vpci_dev->pci_dev, 4, + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_PREFETCH | + PCI_BASE_ADDRESS_MEM_TYPE_64, + &g->hostmem); + virtio_pci_add_shm_cap(vpci_dev, 4, 0, g->conf.hostmem, + VIRTIO_GPU_SHM_ID_HOST_VISIBLE); + } + virtio_pci_force_virtio_1(vpci_dev); if (!qdev_realize(vdev, BUS(&vpci_dev->bus), errp)) { return; diff --git a/hw/display/virtio-gpu-rutabaga.c b/hw/display/virtio-gpu-rutabaga.c new file mode 100644 index 0000000..9e67f9b --- /dev/null +++ b/hw/display/virtio-gpu-rutabaga.c @@ -0,0 +1,1120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/iov.h" +#include "trace.h" +#include "hw/virtio/virtio.h" +#include 
"hw/virtio/virtio-gpu.h" +#include "hw/virtio/virtio-gpu-pixman.h" +#include "hw/virtio/virtio-iommu.h" + +#include <glib/gmem.h> +#include <rutabaga_gfx/rutabaga_gfx_ffi.h> + +#define CHECK(condition, cmd) \ + do { \ + if (!(condition)) { \ + error_report("CHECK failed in %s() %s:" "%d", __func__, \ + __FILE__, __LINE__); \ + (cmd)->error = VIRTIO_GPU_RESP_ERR_UNSPEC; \ + return; \ + } \ + } while (0) + +struct rutabaga_aio_data { + struct VirtIOGPURutabaga *vr; + struct rutabaga_fence fence; +}; + +static void +virtio_gpu_rutabaga_update_cursor(VirtIOGPU *g, struct virtio_gpu_scanout *s, + uint32_t resource_id) +{ + struct virtio_gpu_simple_resource *res; + struct rutabaga_transfer transfer = { 0 }; + struct iovec transfer_iovec; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + res = virtio_gpu_find_resource(g, resource_id); + if (!res) { + return; + } + + if (res->width != s->current_cursor->width || + res->height != s->current_cursor->height) { + return; + } + + transfer.x = 0; + transfer.y = 0; + transfer.z = 0; + transfer.w = res->width; + transfer.h = res->height; + transfer.d = 1; + + transfer_iovec.iov_base = s->current_cursor->data; + transfer_iovec.iov_len = res->width * res->height * 4; + + rutabaga_resource_transfer_read(vr->rutabaga, 0, + resource_id, &transfer, + &transfer_iovec); +} + +static void +virtio_gpu_rutabaga_gl_flushed(VirtIOGPUBase *b) +{ + VirtIOGPU *g = VIRTIO_GPU(b); + virtio_gpu_process_cmdq(g); +} + +static void +rutabaga_cmd_create_resource_2d(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct rutabaga_create_3d rc_3d = { 0 }; + struct virtio_gpu_simple_resource *res; + struct virtio_gpu_resource_create_2d c2d; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(c2d); + trace_virtio_gpu_cmd_res_create_2d(c2d.resource_id, c2d.format, + c2d.width, c2d.height); + + rc_3d.target = 2; + rc_3d.format = c2d.format; + rc_3d.bind = (1 << 1); + rc_3d.width = c2d.width; + rc_3d.height = c2d.height; + rc_3d.depth = 1; + rc_3d.array_size = 1; + rc_3d.last_level = 0; + rc_3d.nr_samples = 0; + rc_3d.flags = VIRTIO_GPU_RESOURCE_FLAG_Y_0_TOP; + + result = rutabaga_resource_create_3d(vr->rutabaga, c2d.resource_id, &rc_3d); + CHECK(!result, cmd); + + res = g_new0(struct virtio_gpu_simple_resource, 1); + res->width = c2d.width; + res->height = c2d.height; + res->format = c2d.format; + res->resource_id = c2d.resource_id; + + QTAILQ_INSERT_HEAD(&g->reslist, res, next); +} + +static void +rutabaga_cmd_create_resource_3d(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct rutabaga_create_3d rc_3d = { 0 }; + struct virtio_gpu_simple_resource *res; + struct virtio_gpu_resource_create_3d c3d; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(c3d); + + trace_virtio_gpu_cmd_res_create_3d(c3d.resource_id, c3d.format, + c3d.width, c3d.height, c3d.depth); + + rc_3d.target = c3d.target; + rc_3d.format = c3d.format; + rc_3d.bind = c3d.bind; + rc_3d.width = c3d.width; + rc_3d.height = c3d.height; + rc_3d.depth = c3d.depth; + rc_3d.array_size = c3d.array_size; + rc_3d.last_level = c3d.last_level; + rc_3d.nr_samples = c3d.nr_samples; + rc_3d.flags = c3d.flags; + + result = rutabaga_resource_create_3d(vr->rutabaga, c3d.resource_id, &rc_3d); + CHECK(!result, cmd); + + res = g_new0(struct virtio_gpu_simple_resource, 1); + res->width = c3d.width; + res->height = c3d.height; + res->format = c3d.format; + res->resource_id = c3d.resource_id; + + QTAILQ_INSERT_HEAD(&g->reslist, res, 
next); +} + +static void +rutabaga_cmd_resource_unref(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct virtio_gpu_simple_resource *res; + struct virtio_gpu_resource_unref unref; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(unref); + + trace_virtio_gpu_cmd_res_unref(unref.resource_id); + + res = virtio_gpu_find_resource(g, unref.resource_id); + CHECK(res, cmd); + + result = rutabaga_resource_unref(vr->rutabaga, unref.resource_id); + CHECK(!result, cmd); + + if (res->image) { + pixman_image_unref(res->image); + } + + QTAILQ_REMOVE(&g->reslist, res, next); + g_free(res); +} + +static void +rutabaga_cmd_context_create(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct virtio_gpu_ctx_create cc; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(cc); + trace_virtio_gpu_cmd_ctx_create(cc.hdr.ctx_id, + cc.debug_name); + + result = rutabaga_context_create(vr->rutabaga, cc.hdr.ctx_id, + cc.context_init, cc.debug_name, cc.nlen); + CHECK(!result, cmd); +} + +static void +rutabaga_cmd_context_destroy(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct virtio_gpu_ctx_destroy cd; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(cd); + trace_virtio_gpu_cmd_ctx_destroy(cd.hdr.ctx_id); + + result = rutabaga_context_destroy(vr->rutabaga, cd.hdr.ctx_id); + CHECK(!result, cmd); +} + +static void +rutabaga_cmd_resource_flush(VirtIOGPU *g, struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result, i; + struct virtio_gpu_scanout *scanout = NULL; + struct virtio_gpu_simple_resource *res; + struct rutabaga_transfer transfer = { 0 }; + struct iovec transfer_iovec; + struct virtio_gpu_resource_flush rf; + bool found = false; + + VirtIOGPUBase *vb = VIRTIO_GPU_BASE(g); + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + if (vr->headless) { + return; + } + + VIRTIO_GPU_FILL_CMD(rf); + trace_virtio_gpu_cmd_res_flush(rf.resource_id, + rf.r.width, rf.r.height, rf.r.x, rf.r.y); + + res = virtio_gpu_find_resource(g, rf.resource_id); + CHECK(res, cmd); + + for (i = 0; i < vb->conf.max_outputs; i++) { + scanout = &vb->scanout[i]; + if (i == res->scanout_bitmask) { + found = true; + break; + } + } + + if (!found) { + return; + } + + transfer.x = 0; + transfer.y = 0; + transfer.z = 0; + transfer.w = res->width; + transfer.h = res->height; + transfer.d = 1; + + transfer_iovec.iov_base = pixman_image_get_data(res->image); + transfer_iovec.iov_len = res->width * res->height * 4; + + result = rutabaga_resource_transfer_read(vr->rutabaga, 0, + rf.resource_id, &transfer, + &transfer_iovec); + CHECK(!result, cmd); + dpy_gfx_update_full(scanout->con); +} + +static void +rutabaga_cmd_set_scanout(VirtIOGPU *g, struct virtio_gpu_ctrl_command *cmd) +{ + struct virtio_gpu_simple_resource *res; + struct virtio_gpu_scanout *scanout = NULL; + struct virtio_gpu_set_scanout ss; + + VirtIOGPUBase *vb = VIRTIO_GPU_BASE(g); + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + if (vr->headless) { + return; + } + + VIRTIO_GPU_FILL_CMD(ss); + trace_virtio_gpu_cmd_set_scanout(ss.scanout_id, ss.resource_id, + ss.r.width, ss.r.height, ss.r.x, ss.r.y); + + CHECK(ss.scanout_id < VIRTIO_GPU_MAX_SCANOUTS, cmd); + scanout = &vb->scanout[ss.scanout_id]; + + if (ss.resource_id == 0) { + dpy_gfx_replace_surface(scanout->con, NULL); + dpy_gl_scanout_disable(scanout->con); + return; + } + + res = virtio_gpu_find_resource(g, ss.resource_id); + CHECK(res, cmd); + + if (!res->image) { + 
pixman_format_code_t pformat; + pformat = virtio_gpu_get_pixman_format(res->format); + CHECK(pformat, cmd); + + res->image = pixman_image_create_bits(pformat, + res->width, + res->height, + NULL, 0); + CHECK(res->image, cmd); + pixman_image_ref(res->image); + } + + vb->enable = 1; + + /* realloc the surface ptr */ + scanout->ds = qemu_create_displaysurface_pixman(res->image); + dpy_gfx_replace_surface(scanout->con, NULL); + dpy_gfx_replace_surface(scanout->con, scanout->ds); + res->scanout_bitmask = ss.scanout_id; +} + +static void +rutabaga_cmd_submit_3d(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct virtio_gpu_cmd_submit cs; + struct rutabaga_command rutabaga_cmd = { 0 }; + g_autofree uint8_t *buf = NULL; + size_t s; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(cs); + trace_virtio_gpu_cmd_ctx_submit(cs.hdr.ctx_id, cs.size); + + buf = g_new0(uint8_t, cs.size); + s = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, + sizeof(cs), buf, cs.size); + CHECK(s == cs.size, cmd); + + rutabaga_cmd.ctx_id = cs.hdr.ctx_id; + rutabaga_cmd.cmd = buf; + rutabaga_cmd.cmd_size = cs.size; + + result = rutabaga_submit_command(vr->rutabaga, &rutabaga_cmd); + CHECK(!result, cmd); +} + +static void +rutabaga_cmd_transfer_to_host_2d(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct rutabaga_transfer transfer = { 0 }; + struct virtio_gpu_transfer_to_host_2d t2d; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(t2d); + trace_virtio_gpu_cmd_res_xfer_toh_2d(t2d.resource_id); + + transfer.x = t2d.r.x; + transfer.y = t2d.r.y; + transfer.z = 0; + transfer.w = t2d.r.width; + transfer.h = t2d.r.height; + transfer.d = 1; + + result = rutabaga_resource_transfer_write(vr->rutabaga, 0, t2d.resource_id, + &transfer); + CHECK(!result, cmd); +} + +static void +rutabaga_cmd_transfer_to_host_3d(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct rutabaga_transfer transfer = { 0 }; + struct virtio_gpu_transfer_host_3d t3d; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(t3d); + trace_virtio_gpu_cmd_res_xfer_toh_3d(t3d.resource_id); + + transfer.x = t3d.box.x; + transfer.y = t3d.box.y; + transfer.z = t3d.box.z; + transfer.w = t3d.box.w; + transfer.h = t3d.box.h; + transfer.d = t3d.box.d; + transfer.level = t3d.level; + transfer.stride = t3d.stride; + transfer.layer_stride = t3d.layer_stride; + transfer.offset = t3d.offset; + + result = rutabaga_resource_transfer_write(vr->rutabaga, t3d.hdr.ctx_id, + t3d.resource_id, &transfer); + CHECK(!result, cmd); +} + +static void +rutabaga_cmd_transfer_from_host_3d(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct rutabaga_transfer transfer = { 0 }; + struct virtio_gpu_transfer_host_3d t3d; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(t3d); + trace_virtio_gpu_cmd_res_xfer_fromh_3d(t3d.resource_id); + + transfer.x = t3d.box.x; + transfer.y = t3d.box.y; + transfer.z = t3d.box.z; + transfer.w = t3d.box.w; + transfer.h = t3d.box.h; + transfer.d = t3d.box.d; + transfer.level = t3d.level; + transfer.stride = t3d.stride; + transfer.layer_stride = t3d.layer_stride; + transfer.offset = t3d.offset; + + result = rutabaga_resource_transfer_read(vr->rutabaga, t3d.hdr.ctx_id, + t3d.resource_id, &transfer, NULL); + CHECK(!result, cmd); +} + +static void +rutabaga_cmd_attach_backing(VirtIOGPU *g, struct virtio_gpu_ctrl_command *cmd) +{ + struct rutabaga_iovecs vecs 
= { 0 }; + struct virtio_gpu_simple_resource *res; + struct virtio_gpu_resource_attach_backing att_rb; + int ret; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(att_rb); + trace_virtio_gpu_cmd_res_back_attach(att_rb.resource_id); + + res = virtio_gpu_find_resource(g, att_rb.resource_id); + CHECK(res, cmd); + CHECK(!res->iov, cmd); + + ret = virtio_gpu_create_mapping_iov(g, att_rb.nr_entries, sizeof(att_rb), + cmd, NULL, &res->iov, &res->iov_cnt); + CHECK(!ret, cmd); + + vecs.iovecs = res->iov; + vecs.num_iovecs = res->iov_cnt; + + ret = rutabaga_resource_attach_backing(vr->rutabaga, att_rb.resource_id, + &vecs); + if (ret != 0) { + virtio_gpu_cleanup_mapping(g, res); + } + + CHECK(!ret, cmd); +} + +static void +rutabaga_cmd_detach_backing(VirtIOGPU *g, struct virtio_gpu_ctrl_command *cmd) +{ + struct virtio_gpu_simple_resource *res; + struct virtio_gpu_resource_detach_backing detach_rb; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(detach_rb); + trace_virtio_gpu_cmd_res_back_detach(detach_rb.resource_id); + + res = virtio_gpu_find_resource(g, detach_rb.resource_id); + CHECK(res, cmd); + + rutabaga_resource_detach_backing(vr->rutabaga, + detach_rb.resource_id); + + virtio_gpu_cleanup_mapping(g, res); +} + +static void +rutabaga_cmd_ctx_attach_resource(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct virtio_gpu_ctx_resource att_res; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(att_res); + trace_virtio_gpu_cmd_ctx_res_attach(att_res.hdr.ctx_id, + att_res.resource_id); + + result = rutabaga_context_attach_resource(vr->rutabaga, att_res.hdr.ctx_id, + att_res.resource_id); + CHECK(!result, cmd); +} + +static void +rutabaga_cmd_ctx_detach_resource(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct virtio_gpu_ctx_resource det_res; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(det_res); + trace_virtio_gpu_cmd_ctx_res_detach(det_res.hdr.ctx_id, + det_res.resource_id); + + result = rutabaga_context_detach_resource(vr->rutabaga, det_res.hdr.ctx_id, + det_res.resource_id); + CHECK(!result, cmd); +} + +static void +rutabaga_cmd_get_capset_info(VirtIOGPU *g, struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct virtio_gpu_get_capset_info info; + struct virtio_gpu_resp_capset_info resp; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(info); + + result = rutabaga_get_capset_info(vr->rutabaga, info.capset_index, + &resp.capset_id, &resp.capset_max_version, + &resp.capset_max_size); + CHECK(!result, cmd); + + resp.hdr.type = VIRTIO_GPU_RESP_OK_CAPSET_INFO; + virtio_gpu_ctrl_response(g, cmd, &resp.hdr, sizeof(resp)); +} + +static void +rutabaga_cmd_get_capset(VirtIOGPU *g, struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + struct virtio_gpu_get_capset gc; + struct virtio_gpu_resp_capset *resp; + uint32_t capset_size, capset_version; + uint32_t current_id, i; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(gc); + for (i = 0; i < vr->num_capsets; i++) { + result = rutabaga_get_capset_info(vr->rutabaga, i, + &current_id, &capset_version, + &capset_size); + CHECK(!result, cmd); + + if (current_id == gc.capset_id) { + break; + } + } + + CHECK(i < vr->num_capsets, cmd); + + resp = g_malloc0(sizeof(*resp) + capset_size); + resp->hdr.type = VIRTIO_GPU_RESP_OK_CAPSET; + rutabaga_get_capset(vr->rutabaga, gc.capset_id, gc.capset_version, + resp->capset_data,
capset_size); + + virtio_gpu_ctrl_response(g, cmd, &resp->hdr, sizeof(*resp) + capset_size); + g_free(resp); +} + +static void +rutabaga_cmd_resource_create_blob(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int result; + struct rutabaga_iovecs vecs = { 0 }; + g_autofree struct virtio_gpu_simple_resource *res = NULL; + struct virtio_gpu_resource_create_blob cblob; + struct rutabaga_create_blob rc_blob = { 0 }; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(cblob); + trace_virtio_gpu_cmd_res_create_blob(cblob.resource_id, cblob.size); + + CHECK(cblob.resource_id != 0, cmd); + + res = g_new0(struct virtio_gpu_simple_resource, 1); + + res->resource_id = cblob.resource_id; + res->blob_size = cblob.size; + + if (cblob.blob_mem != VIRTIO_GPU_BLOB_MEM_HOST3D) { + result = virtio_gpu_create_mapping_iov(g, cblob.nr_entries, + sizeof(cblob), cmd, &res->addrs, + &res->iov, &res->iov_cnt); + CHECK(!result, cmd); + } + + rc_blob.blob_id = cblob.blob_id; + rc_blob.blob_mem = cblob.blob_mem; + rc_blob.blob_flags = cblob.blob_flags; + rc_blob.size = cblob.size; + + vecs.iovecs = res->iov; + vecs.num_iovecs = res->iov_cnt; + + result = rutabaga_resource_create_blob(vr->rutabaga, cblob.hdr.ctx_id, + cblob.resource_id, &rc_blob, &vecs, + NULL); + + if (result && cblob.blob_mem != VIRTIO_GPU_BLOB_MEM_HOST3D) { + virtio_gpu_cleanup_mapping(g, res); + } + + CHECK(!result, cmd); + + QTAILQ_INSERT_HEAD(&g->reslist, res, next); + res = NULL; +} + +static void +rutabaga_cmd_resource_map_blob(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + uint32_t map_info = 0; + uint32_t slot = 0; + struct virtio_gpu_simple_resource *res; + struct rutabaga_mapping mapping = { 0 }; + struct virtio_gpu_resource_map_blob mblob; + struct virtio_gpu_resp_map_info resp = { 0 }; + + VirtIOGPUBase *vb = VIRTIO_GPU_BASE(g); + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(mblob); + + CHECK(mblob.resource_id != 0, cmd); + + res = virtio_gpu_find_resource(g, mblob.resource_id); + CHECK(res, cmd); + + result = rutabaga_resource_map_info(vr->rutabaga, mblob.resource_id, + &map_info); + CHECK(!result, cmd); + + /* + * RUTABAGA_MAP_ACCESS_* flags are not part of the virtio-gpu spec, but do + * exist to potentially allow the hypervisor to restrict write access to + * memory. QEMU does not need to use this functionality at the moment. + */ + resp.map_info = map_info & RUTABAGA_MAP_CACHE_MASK; + + result = rutabaga_resource_map(vr->rutabaga, mblob.resource_id, &mapping); + CHECK(!result, cmd); + + /* + * There is a small risk of the MemoryRegion dereferencing the pointer after + * rutabaga unmaps it. Please see discussion here: + * + * https://lists.gnu.org/archive/html/qemu-devel/2023-09/msg05141.html + * + * It is highly unlikely to happen in practice and doesn't affect known + * use cases. However, it should be fixed and is noted here for posterity.
+ */ + for (slot = 0; slot < MAX_SLOTS; slot++) { + if (vr->memory_regions[slot].used) { + continue; + } + + MemoryRegion *mr = &(vr->memory_regions[slot].mr); + memory_region_init_ram_ptr(mr, OBJECT(vr), "blob", mapping.size, + mapping.ptr); + memory_region_add_subregion(&vb->hostmem, mblob.offset, mr); + vr->memory_regions[slot].resource_id = mblob.resource_id; + vr->memory_regions[slot].used = 1; + break; + } + + if (slot >= MAX_SLOTS) { + result = rutabaga_resource_unmap(vr->rutabaga, mblob.resource_id); + CHECK(!result, cmd); + } + + CHECK(slot < MAX_SLOTS, cmd); + + resp.hdr.type = VIRTIO_GPU_RESP_OK_MAP_INFO; + virtio_gpu_ctrl_response(g, cmd, &resp.hdr, sizeof(resp)); +} + +static void +rutabaga_cmd_resource_unmap_blob(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + int32_t result; + uint32_t slot = 0; + struct virtio_gpu_simple_resource *res; + struct virtio_gpu_resource_unmap_blob ublob; + + VirtIOGPUBase *vb = VIRTIO_GPU_BASE(g); + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(ublob); + + CHECK(ublob.resource_id != 0, cmd); + + res = virtio_gpu_find_resource(g, ublob.resource_id); + CHECK(res, cmd); + + for (slot = 0; slot < MAX_SLOTS; slot++) { + if (vr->memory_regions[slot].resource_id != ublob.resource_id) { + continue; + } + + MemoryRegion *mr = &(vr->memory_regions[slot].mr); + memory_region_del_subregion(&vb->hostmem, mr); + + vr->memory_regions[slot].resource_id = 0; + vr->memory_regions[slot].used = 0; + break; + } + + CHECK(slot < MAX_SLOTS, cmd); + result = rutabaga_resource_unmap(vr->rutabaga, res->resource_id); + CHECK(!result, cmd); +} + +static void +virtio_gpu_rutabaga_process_cmd(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ + struct rutabaga_fence fence = { 0 }; + int32_t result; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + VIRTIO_GPU_FILL_CMD(cmd->cmd_hdr); + + switch (cmd->cmd_hdr.type) { + case VIRTIO_GPU_CMD_CTX_CREATE: + rutabaga_cmd_context_create(g, cmd); + break; + case VIRTIO_GPU_CMD_CTX_DESTROY: + rutabaga_cmd_context_destroy(g, cmd); + break; + case VIRTIO_GPU_CMD_RESOURCE_CREATE_2D: + rutabaga_cmd_create_resource_2d(g, cmd); + break; + case VIRTIO_GPU_CMD_RESOURCE_CREATE_3D: + rutabaga_cmd_create_resource_3d(g, cmd); + break; + case VIRTIO_GPU_CMD_SUBMIT_3D: + rutabaga_cmd_submit_3d(g, cmd); + break; + case VIRTIO_GPU_CMD_TRANSFER_TO_HOST_2D: + rutabaga_cmd_transfer_to_host_2d(g, cmd); + break; + case VIRTIO_GPU_CMD_TRANSFER_TO_HOST_3D: + rutabaga_cmd_transfer_to_host_3d(g, cmd); + break; + case VIRTIO_GPU_CMD_TRANSFER_FROM_HOST_3D: + rutabaga_cmd_transfer_from_host_3d(g, cmd); + break; + case VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING: + rutabaga_cmd_attach_backing(g, cmd); + break; + case VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING: + rutabaga_cmd_detach_backing(g, cmd); + break; + case VIRTIO_GPU_CMD_SET_SCANOUT: + rutabaga_cmd_set_scanout(g, cmd); + break; + case VIRTIO_GPU_CMD_RESOURCE_FLUSH: + rutabaga_cmd_resource_flush(g, cmd); + break; + case VIRTIO_GPU_CMD_RESOURCE_UNREF: + rutabaga_cmd_resource_unref(g, cmd); + break; + case VIRTIO_GPU_CMD_CTX_ATTACH_RESOURCE: + rutabaga_cmd_ctx_attach_resource(g, cmd); + break; + case VIRTIO_GPU_CMD_CTX_DETACH_RESOURCE: + rutabaga_cmd_ctx_detach_resource(g, cmd); + break; + case VIRTIO_GPU_CMD_GET_CAPSET_INFO: + rutabaga_cmd_get_capset_info(g, cmd); + break; + case VIRTIO_GPU_CMD_GET_CAPSET: + rutabaga_cmd_get_capset(g, cmd); + break; + case VIRTIO_GPU_CMD_GET_DISPLAY_INFO: + virtio_gpu_get_display_info(g, cmd); + break; + case VIRTIO_GPU_CMD_GET_EDID: + 
virtio_gpu_get_edid(g, cmd); + break; + case VIRTIO_GPU_CMD_RESOURCE_CREATE_BLOB: + rutabaga_cmd_resource_create_blob(g, cmd); + break; + case VIRTIO_GPU_CMD_RESOURCE_MAP_BLOB: + rutabaga_cmd_resource_map_blob(g, cmd); + break; + case VIRTIO_GPU_CMD_RESOURCE_UNMAP_BLOB: + rutabaga_cmd_resource_unmap_blob(g, cmd); + break; + default: + cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC; + break; + } + + if (cmd->finished) { + return; + } + if (cmd->error) { + error_report("%s: ctrl 0x%x, error 0x%x", __func__, + cmd->cmd_hdr.type, cmd->error); + virtio_gpu_ctrl_response_nodata(g, cmd, cmd->error); + return; + } + if (!(cmd->cmd_hdr.flags & VIRTIO_GPU_FLAG_FENCE)) { + virtio_gpu_ctrl_response_nodata(g, cmd, VIRTIO_GPU_RESP_OK_NODATA); + return; + } + + fence.flags = cmd->cmd_hdr.flags; + fence.ctx_id = cmd->cmd_hdr.ctx_id; + fence.fence_id = cmd->cmd_hdr.fence_id; + fence.ring_idx = cmd->cmd_hdr.ring_idx; + + trace_virtio_gpu_fence_ctrl(cmd->cmd_hdr.fence_id, cmd->cmd_hdr.type); + + result = rutabaga_create_fence(vr->rutabaga, &fence); + CHECK(!result, cmd); +} + +static void +virtio_gpu_rutabaga_aio_cb(void *opaque) +{ + struct rutabaga_aio_data *data = opaque; + VirtIOGPU *g = VIRTIO_GPU(data->vr); + struct rutabaga_fence fence_data = data->fence; + struct virtio_gpu_ctrl_command *cmd, *tmp; + + uint32_t signaled_ctx_specific = fence_data.flags & + RUTABAGA_FLAG_INFO_RING_IDX; + + QTAILQ_FOREACH_SAFE(cmd, &g->fenceq, next, tmp) { + /* + * Match fences only against commands on the same + * context-specific timeline. + */ + uint32_t target_ctx_specific = cmd->cmd_hdr.flags & + RUTABAGA_FLAG_INFO_RING_IDX; + + if (signaled_ctx_specific != target_ctx_specific) { + continue; + } + + if (signaled_ctx_specific && + (cmd->cmd_hdr.ring_idx != fence_data.ring_idx)) { + continue; + } + + if (cmd->cmd_hdr.fence_id > fence_data.fence_id) { + continue; + } + + trace_virtio_gpu_fence_resp(cmd->cmd_hdr.fence_id); + virtio_gpu_ctrl_response_nodata(g, cmd, VIRTIO_GPU_RESP_OK_NODATA); + QTAILQ_REMOVE(&g->fenceq, cmd, next); + g_free(cmd); + } + + g_free(data); +} + +static void +virtio_gpu_rutabaga_fence_cb(uint64_t user_data, + const struct rutabaga_fence *fence) +{ + struct rutabaga_aio_data *data; + VirtIOGPU *g = (VirtIOGPU *)user_data; + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + /* + * gfxstream, cross-domain, and even newer versions of virglrenderer + * (see VIRGL_RENDERER_ASYNC_FENCE_CB) like to signal fence completion on + * threads ("callback threads") that are different from the thread that + * processes the command queue ("main thread"). + * + * crosvm and other virtio-gpu 1.1 implementations enable callback threads + * via locking. However, on QEMU a deadlock is observed if + * virtio_gpu_ctrl_response_nodata(..) [used in the fence callback] is used + * from a thread that is not the main thread. + * + * The reason is that QEMU's internal locking is designed to work with QEMU + * threads (see rcu_register_thread()) and not generic C/C++/Rust threads. + * For now, we can work around this by scheduling the return of the + * fence descriptors on the main thread.
+ */ + + data = g_new0(struct rutabaga_aio_data, 1); + data->vr = vr; + data->fence = *fence; + aio_bh_schedule_oneshot(qemu_get_aio_context(), + virtio_gpu_rutabaga_aio_cb, + data); +} + +static void +virtio_gpu_rutabaga_debug_cb(uint64_t user_data, + const struct rutabaga_debug *debug) +{ + switch (debug->debug_type) { + case RUTABAGA_DEBUG_ERROR: + error_report("%s", debug->message); + break; + case RUTABAGA_DEBUG_WARN: + warn_report("%s", debug->message); + break; + case RUTABAGA_DEBUG_INFO: + info_report("%s", debug->message); + break; + default: + error_report("unknown debug type: %u", debug->debug_type); + } +} + +static bool virtio_gpu_rutabaga_init(VirtIOGPU *g, Error **errp) +{ + int result; + struct rutabaga_builder builder = { 0 }; + struct rutabaga_channel channel = { 0 }; + struct rutabaga_channels channels = { 0 }; + + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + vr->rutabaga = NULL; + + builder.wsi = RUTABAGA_WSI_SURFACELESS; + /* + * Currently, if WSI is specified, the only valid strings are "surfaceless" + * or "headless". Surfaceless doesn't create a native window surface, but + * does copy from the render target to the Pixman buffer if a virtio-gpu + * 2D hypercall is issued. Surfaceless is the default. + * + * Headless is like surfaceless, but doesn't copy to the Pixman buffer. The + * use case is automated testing environments where there is no need to view + * results. + * + * In the future, more performant virtio-gpu 2D UI integration may be added. + */ + if (vr->wsi) { + if (g_str_equal(vr->wsi, "surfaceless")) { + vr->headless = false; + } else if (g_str_equal(vr->wsi, "headless")) { + vr->headless = true; + } else { + error_setg(errp, "invalid wsi option selected"); + return false; + } + } + + builder.fence_cb = virtio_gpu_rutabaga_fence_cb; + builder.debug_cb = virtio_gpu_rutabaga_debug_cb; + builder.capset_mask = vr->capset_mask; + builder.user_data = (uint64_t)g; + + /* + * If the user doesn't specify the wayland socket path, we try to infer + * the socket via a process similar to the one used by libwayland. + * libwayland does the following: + * + * 1) If $WAYLAND_DISPLAY is set, attempt to connect to + * $XDG_RUNTIME_DIR/$WAYLAND_DISPLAY + * 2) Otherwise, attempt to connect to $XDG_RUNTIME_DIR/wayland-0 + * 3) Otherwise, don't pass a wayland socket to rutabaga. If a guest + * wayland proxy is launched, it will fail to work.
+ */ + channel.channel_type = RUTABAGA_CHANNEL_TYPE_WAYLAND; + g_autofree gchar *path = NULL; + if (!vr->wayland_socket_path) { + const gchar *runtime_dir = g_get_user_runtime_dir(); + const gchar *display = g_getenv("WAYLAND_DISPLAY"); + if (!display) { + display = "wayland-0"; + } + + if (runtime_dir) { + path = g_build_filename(runtime_dir, display, NULL); + channel.channel_name = path; + } + } else { + channel.channel_name = vr->wayland_socket_path; + } + + if ((builder.capset_mask & (1 << RUTABAGA_CAPSET_CROSS_DOMAIN))) { + if (channel.channel_name) { + channels.channels = &channel; + channels.num_channels = 1; + builder.channels = &channels; + } + } + + result = rutabaga_init(&builder, &vr->rutabaga); + if (result) { + error_setg_errno(errp, -result, "Failed to init rutabaga"); + return false; + } + + return true; +} + +static int virtio_gpu_rutabaga_get_num_capsets(VirtIOGPU *g) +{ + int result; + uint32_t num_capsets; + VirtIOGPURutabaga *vr = VIRTIO_GPU_RUTABAGA(g); + + result = rutabaga_get_num_capsets(vr->rutabaga, &num_capsets); + if (result) { + error_report("Failed to get capsets"); + return 0; + } + vr->num_capsets = num_capsets; + return num_capsets; +} + +static void virtio_gpu_rutabaga_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOGPU *g = VIRTIO_GPU(vdev); + struct virtio_gpu_ctrl_command *cmd; + + if (!virtio_queue_ready(vq)) { + return; + } + + cmd = virtqueue_pop(vq, sizeof(struct virtio_gpu_ctrl_command)); + while (cmd) { + cmd->vq = vq; + cmd->error = 0; + cmd->finished = false; + QTAILQ_INSERT_TAIL(&g->cmdq, cmd, next); + cmd = virtqueue_pop(vq, sizeof(struct virtio_gpu_ctrl_command)); + } + + virtio_gpu_process_cmdq(g); +} + +static void virtio_gpu_rutabaga_realize(DeviceState *qdev, Error **errp) +{ + int num_capsets; + VirtIOGPUBase *bdev = VIRTIO_GPU_BASE(qdev); + VirtIOGPU *gpudev = VIRTIO_GPU(qdev); + +#if HOST_BIG_ENDIAN + error_setg(errp, "rutabaga is not supported on bigendian platforms"); + return; +#endif + + if (!virtio_gpu_rutabaga_init(gpudev, errp)) { + return; + } + + num_capsets = virtio_gpu_rutabaga_get_num_capsets(gpudev); + if (!num_capsets) { + return; + } + + bdev->conf.flags |= (1 << VIRTIO_GPU_FLAG_RUTABAGA_ENABLED); + bdev->conf.flags |= (1 << VIRTIO_GPU_FLAG_BLOB_ENABLED); + bdev->conf.flags |= (1 << VIRTIO_GPU_FLAG_CONTEXT_INIT_ENABLED); + + bdev->virtio_config.num_capsets = num_capsets; + virtio_gpu_device_realize(qdev, errp); +} + +static Property virtio_gpu_rutabaga_properties[] = { + DEFINE_PROP_BIT64("gfxstream-vulkan", VirtIOGPURutabaga, capset_mask, + RUTABAGA_CAPSET_GFXSTREAM_VULKAN, false), + DEFINE_PROP_BIT64("cross-domain", VirtIOGPURutabaga, capset_mask, + RUTABAGA_CAPSET_CROSS_DOMAIN, false), + DEFINE_PROP_BIT64("x-gfxstream-gles", VirtIOGPURutabaga, capset_mask, + RUTABAGA_CAPSET_GFXSTREAM_GLES, false), + DEFINE_PROP_BIT64("x-gfxstream-composer", VirtIOGPURutabaga, capset_mask, + RUTABAGA_CAPSET_GFXSTREAM_COMPOSER, false), + DEFINE_PROP_STRING("wayland-socket-path", VirtIOGPURutabaga, + wayland_socket_path), + DEFINE_PROP_STRING("wsi", VirtIOGPURutabaga, wsi), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_gpu_rutabaga_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + VirtIOGPUBaseClass *vbc = VIRTIO_GPU_BASE_CLASS(klass); + VirtIOGPUClass *vgc = VIRTIO_GPU_CLASS(klass); + + vbc->gl_flushed = virtio_gpu_rutabaga_gl_flushed; + vgc->handle_ctrl = virtio_gpu_rutabaga_handle_ctrl; + vgc->process_cmd = 
virtio_gpu_rutabaga_process_cmd; + vgc->update_cursor_data = virtio_gpu_rutabaga_update_cursor; + + vdc->realize = virtio_gpu_rutabaga_realize; + device_class_set_props(dc, virtio_gpu_rutabaga_properties); +} + +static const TypeInfo virtio_gpu_rutabaga_info[] = { + { + .name = TYPE_VIRTIO_GPU_RUTABAGA, + .parent = TYPE_VIRTIO_GPU, + .instance_size = sizeof(VirtIOGPURutabaga), + .class_init = virtio_gpu_rutabaga_class_init, + }, +}; + +DEFINE_TYPES(virtio_gpu_rutabaga_info) + +module_obj(TYPE_VIRTIO_GPU_RUTABAGA); +module_kconfig(VIRTIO_GPU); +module_dep("hw-display-virtio-gpu"); diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index 93857ad..6efd15b 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -33,15 +33,11 @@ #define VIRTIO_GPU_VM_VERSION 1 -static struct virtio_gpu_simple_resource* -virtio_gpu_find_resource(VirtIOGPU *g, uint32_t resource_id); static struct virtio_gpu_simple_resource * virtio_gpu_find_check_resource(VirtIOGPU *g, uint32_t resource_id, bool require_backing, const char *caller, uint32_t *error); -static void virtio_gpu_cleanup_mapping(VirtIOGPU *g, - struct virtio_gpu_simple_resource *res); static void virtio_gpu_reset_bh(void *opaque); void virtio_gpu_update_cursor_data(VirtIOGPU *g, @@ -116,7 +112,7 @@ static void update_cursor(VirtIOGPU *g, struct virtio_gpu_update_cursor *cursor) cursor->resource_id ? 1 : 0); } -static struct virtio_gpu_simple_resource * +struct virtio_gpu_simple_resource * virtio_gpu_find_resource(VirtIOGPU *g, uint32_t resource_id) { struct virtio_gpu_simple_resource *res; @@ -904,8 +900,8 @@ void virtio_gpu_cleanup_mapping_iov(VirtIOGPU *g, g_free(iov); } -static void virtio_gpu_cleanup_mapping(VirtIOGPU *g, - struct virtio_gpu_simple_resource *res) +void virtio_gpu_cleanup_mapping(VirtIOGPU *g, + struct virtio_gpu_simple_resource *res) { virtio_gpu_cleanup_mapping_iov(g, res->iov, res->iov_cnt); res->iov = NULL; @@ -1367,8 +1363,9 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp) VirtIOGPU *g = VIRTIO_GPU(qdev); if (virtio_gpu_blob_enabled(g->parent_obj.conf)) { - if (!virtio_gpu_have_udmabuf()) { - error_setg(errp, "cannot enable blob resources without udmabuf"); + if (!virtio_gpu_rutabaga_enabled(g->parent_obj.conf) && + !virtio_gpu_have_udmabuf()) { + error_setg(errp, "need rutabaga or udmabuf for blob resources"); return; } @@ -1511,6 +1508,7 @@ static Property virtio_gpu_properties[] = { 256 * MiB), DEFINE_PROP_BIT("blob", VirtIOGPU, parent_obj.conf.flags, VIRTIO_GPU_FLAG_BLOB_ENABLED, false), + DEFINE_PROP_SIZE("hostmem", VirtIOGPU, parent_obj.conf.hostmem, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/display/virtio-vga-rutabaga.c b/hw/display/virtio-vga-rutabaga.c new file mode 100644 index 0000000..a7bef6d --- /dev/null +++ b/hw/display/virtio-vga-rutabaga.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "qemu/osdep.h" +#include "hw/pci/pci.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-gpu.h" +#include "hw/display/vga.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "virtio-vga.h" +#include "qom/object.h" + +#define TYPE_VIRTIO_VGA_RUTABAGA "virtio-vga-rutabaga" + +OBJECT_DECLARE_SIMPLE_TYPE(VirtIOVGARutabaga, VIRTIO_VGA_RUTABAGA) + +struct VirtIOVGARutabaga { + VirtIOVGABase parent_obj; + + VirtIOGPURutabaga vdev; +}; + +static void virtio_vga_rutabaga_inst_initfn(Object *obj) +{ + VirtIOVGARutabaga *dev = VIRTIO_VGA_RUTABAGA(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + 
TYPE_VIRTIO_GPU_RUTABAGA); + VIRTIO_VGA_BASE(dev)->vgpu = VIRTIO_GPU_BASE(&dev->vdev); +} + +static VirtioPCIDeviceTypeInfo virtio_vga_rutabaga_info = { + .generic_name = TYPE_VIRTIO_VGA_RUTABAGA, + .parent = TYPE_VIRTIO_VGA_BASE, + .instance_size = sizeof(VirtIOVGARutabaga), + .instance_init = virtio_vga_rutabaga_inst_initfn, +}; +module_obj(TYPE_VIRTIO_VGA_RUTABAGA); +module_kconfig(VIRTIO_VGA); + +static void virtio_vga_register_types(void) +{ + if (have_vga) { + virtio_pci_types_register(&virtio_vga_rutabaga_info); + } +} + +type_init(virtio_vga_register_types) + +module_dep("hw-display-virtio-vga"); diff --git a/hw/display/virtio-vga.c b/hw/display/virtio-vga.c index e6fb0aa..c8552ff 100644 --- a/hw/display/virtio-vga.c +++ b/hw/display/virtio-vga.c @@ -115,17 +115,32 @@ static void virtio_vga_base_realize(VirtIOPCIProxy *vpci_dev, Error **errp) pci_register_bar(&vpci_dev->pci_dev, 0, PCI_BASE_ADDRESS_MEM_PREFETCH, &vga->vram); - /* - * Configure virtio bar and regions - * - * We use bar #2 for the mmio regions, to be compatible with stdvga. - * virtio regions are moved to the end of bar #2, to make room for - * the stdvga mmio registers at the start of bar #2. - */ - vpci_dev->modern_mem_bar_idx = 2; - vpci_dev->msix_bar_idx = 4; vpci_dev->modern_io_bar_idx = 5; + if (!virtio_gpu_hostmem_enabled(g->conf)) { + /* + * Configure virtio bar and regions + * + * We use bar #2 for the mmio regions, to be compatible with stdvga. + * virtio regions are moved to the end of bar #2, to make room for + * the stdvga mmio registers at the start of bar #2. + */ + vpci_dev->modern_mem_bar_idx = 2; + vpci_dev->msix_bar_idx = 4; + } else { + vpci_dev->msix_bar_idx = 1; + vpci_dev->modern_mem_bar_idx = 2; + memory_region_init(&g->hostmem, OBJECT(g), "virtio-gpu-hostmem", + g->conf.hostmem); + pci_register_bar(&vpci_dev->pci_dev, 4, + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_PREFETCH | + PCI_BASE_ADDRESS_MEM_TYPE_64, + &g->hostmem); + virtio_pci_add_shm_cap(vpci_dev, 4, 0, g->conf.hostmem, + VIRTIO_GPU_SHM_ID_HOST_VISIBLE); + } + if (!(vpci_dev->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ)) { /* * with page-per-vq=off there is no padding space we can use diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index abebd00..af1f4bc 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1435,6 +1435,24 @@ static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, return offset; } +int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, + uint8_t bar, uint64_t offset, uint64_t length, + uint8_t id) +{ + struct virtio_pci_cap64 cap = { + .cap.cap_len = sizeof cap, + .cap.cfg_type = VIRTIO_PCI_CAP_SHARED_MEMORY_CFG, + }; + + cap.cap.bar = bar; + cap.cap.length = cpu_to_le32(length); + cap.length_hi = cpu_to_le32(length >> 32); + cap.cap.offset = cpu_to_le32(offset); + cap.offset_hi = cpu_to_le32(offset >> 32); + cap.cap.id = id; + return virtio_pci_add_mem_cap(proxy, &cap.cap); +} + static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, unsigned size) { diff --git a/include/hw/virtio/virtio-gpu-bswap.h b/include/hw/virtio/virtio-gpu-bswap.h index 637a058..dd1975e 100644 --- a/include/hw/virtio/virtio-gpu-bswap.h +++ b/include/hw/virtio/virtio-gpu-bswap.h @@ -71,6 +71,21 @@ virtio_gpu_create_blob_bswap(struct virtio_gpu_resource_create_blob *cblob) } static inline void +virtio_gpu_map_blob_bswap(struct virtio_gpu_resource_map_blob *mblob) +{ + virtio_gpu_ctrl_hdr_bswap(&mblob->hdr); + le32_to_cpus(&mblob->resource_id); + le64_to_cpus(&mblob->offset); +} + +static inline void 
+virtio_gpu_unmap_blob_bswap(struct virtio_gpu_resource_unmap_blob *ublob) +{ + virtio_gpu_ctrl_hdr_bswap(&ublob->hdr); + le32_to_cpus(&ublob->resource_id); +} + +static inline void virtio_gpu_scanout_blob_bswap(struct virtio_gpu_set_scanout_blob *ssb) { virtio_gpu_bswap_32(ssb, sizeof(*ssb) - sizeof(ssb->offsets[3])); diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h index 4739fa4..584ba2e 100644 --- a/include/hw/virtio/virtio-gpu.h +++ b/include/hw/virtio/virtio-gpu.h @@ -38,6 +38,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(VirtIOGPUGL, VIRTIO_GPU_GL) #define TYPE_VHOST_USER_GPU "vhost-user-gpu" OBJECT_DECLARE_SIMPLE_TYPE(VhostUserGPU, VHOST_USER_GPU) +#define TYPE_VIRTIO_GPU_RUTABAGA "virtio-gpu-rutabaga-device" +OBJECT_DECLARE_SIMPLE_TYPE(VirtIOGPURutabaga, VIRTIO_GPU_RUTABAGA) + struct virtio_gpu_simple_resource { uint32_t resource_id; uint32_t width; @@ -93,6 +96,8 @@ enum virtio_gpu_base_conf_flags { VIRTIO_GPU_FLAG_EDID_ENABLED, VIRTIO_GPU_FLAG_DMABUF_ENABLED, VIRTIO_GPU_FLAG_BLOB_ENABLED, + VIRTIO_GPU_FLAG_CONTEXT_INIT_ENABLED, + VIRTIO_GPU_FLAG_RUTABAGA_ENABLED, }; #define virtio_gpu_virgl_enabled(_cfg) \ @@ -105,12 +110,19 @@ enum virtio_gpu_base_conf_flags { (_cfg.flags & (1 << VIRTIO_GPU_FLAG_DMABUF_ENABLED)) #define virtio_gpu_blob_enabled(_cfg) \ (_cfg.flags & (1 << VIRTIO_GPU_FLAG_BLOB_ENABLED)) +#define virtio_gpu_context_init_enabled(_cfg) \ + (_cfg.flags & (1 << VIRTIO_GPU_FLAG_CONTEXT_INIT_ENABLED)) +#define virtio_gpu_rutabaga_enabled(_cfg) \ + (_cfg.flags & (1 << VIRTIO_GPU_FLAG_RUTABAGA_ENABLED)) +#define virtio_gpu_hostmem_enabled(_cfg) \ + (_cfg.hostmem > 0) struct virtio_gpu_base_conf { uint32_t max_outputs; uint32_t flags; uint32_t xres; uint32_t yres; + uint64_t hostmem; }; struct virtio_gpu_ctrl_command { @@ -134,6 +146,8 @@ struct VirtIOGPUBase { int renderer_blocked; int enable; + MemoryRegion hostmem; + struct virtio_gpu_scanout scanout[VIRTIO_GPU_MAX_SCANOUTS]; int enabled_output_bitmask; @@ -224,6 +238,27 @@ struct VhostUserGPU { bool backend_blocked; }; +#define MAX_SLOTS 4096 + +struct MemoryRegionInfo { + int used; + MemoryRegion mr; + uint32_t resource_id; +}; + +struct rutabaga; + +struct VirtIOGPURutabaga { + VirtIOGPU parent_obj; + struct MemoryRegionInfo memory_regions[MAX_SLOTS]; + uint64_t capset_mask; + char *wayland_socket_path; + char *wsi; + bool headless; + uint32_t num_capsets; + struct rutabaga *rutabaga; +}; + #define VIRTIO_GPU_FILL_CMD(out) do { \ size_t virtiogpufillcmd_s_ = \ iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, 0, \ @@ -249,6 +284,9 @@ void virtio_gpu_base_fill_display_info(VirtIOGPUBase *g, void virtio_gpu_base_generate_edid(VirtIOGPUBase *g, int scanout, struct virtio_gpu_resp_edid *edid); /* virtio-gpu.c */ +struct virtio_gpu_simple_resource * +virtio_gpu_find_resource(VirtIOGPU *g, uint32_t resource_id); + void virtio_gpu_ctrl_response(VirtIOGPU *g, struct virtio_gpu_ctrl_command *cmd, struct virtio_gpu_ctrl_hdr *resp, @@ -267,6 +305,8 @@ int virtio_gpu_create_mapping_iov(VirtIOGPU *g, uint32_t *niov); void virtio_gpu_cleanup_mapping_iov(VirtIOGPU *g, struct iovec *iov, uint32_t count); +void virtio_gpu_cleanup_mapping(VirtIOGPU *g, + struct virtio_gpu_simple_resource *res); void virtio_gpu_process_cmdq(VirtIOGPU *g); void virtio_gpu_device_realize(DeviceState *qdev, Error **errp); void virtio_gpu_reset(VirtIODevice *vdev); diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h index ab2051b..5a3f182 100644 --- a/include/hw/virtio/virtio-pci.h +++ b/include/hw/virtio/virtio-pci.h @@ 
-264,4 +264,8 @@ unsigned virtio_pci_optimal_num_queues(unsigned fixed_queues); void virtio_pci_set_guest_notifier_fd_handler(VirtIODevice *vdev, VirtQueue *vq, int n, bool assign, bool with_irqfd); + +int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, uint8_t bar, uint64_t offset, + uint64_t length, uint8_t id); + #endif diff --git a/include/migration/register.h b/include/migration/register.h index 2b12c6a..fed1d04 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -25,6 +25,7 @@ typedef struct SaveVMHandlers { * used to perform early checks. */ int (*save_prepare)(void *opaque, Error **errp); + int (*save_setup)(QEMUFile *f, void *opaque); void (*save_cleanup)(void *opaque); int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque); int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); @@ -50,7 +51,6 @@ typedef struct SaveVMHandlers { int (*save_live_iterate)(QEMUFile *f, void *opaque); /* This runs outside the iothread lock! */ - int (*save_setup)(QEMUFile *f, void *opaque); /* Note for save_live_pending: * must_precopy: * - must be migrated in precopy or in stopped state diff --git a/meson.build b/meson.build index 0182622..259dc5f 100644 --- a/meson.build +++ b/meson.build @@ -1046,6 +1046,12 @@ if not get_option('virglrenderer').auto() or have_system or have_vhost_user_gpu dependencies: virgl)) endif endif +rutabaga = not_found +if not get_option('rutabaga_gfx').auto() or have_system or have_vhost_user_gpu + rutabaga = dependency('rutabaga_gfx_ffi', + method: 'pkg-config', + required: get_option('rutabaga_gfx')) +endif blkio = not_found if not get_option('blkio').auto() or have_block blkio = dependency('blkio', @@ -4284,6 +4290,7 @@ summary_info += {'libtasn1': tasn1} summary_info += {'PAM': pam} summary_info += {'iconv support': iconv} summary_info += {'virgl support': virgl} +summary_info += {'rutabaga support': rutabaga} summary_info += {'blkio support': blkio} summary_info += {'curl support': curl} summary_info += {'Multipath support': mpathpersist} diff --git a/meson_options.txt b/meson_options.txt index 1b0c02b..3c7398f 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -232,6 +232,8 @@ option('vmnet', type : 'feature', value : 'auto', description: 'vmnet.framework network backend support') option('virglrenderer', type : 'feature', value : 'auto', description: 'virgl rendering support') +option('rutabaga_gfx', type : 'feature', value : 'auto', + description: 'rutabaga_gfx support') option('png', type : 'feature', value : 'auto', description: 'PNG support with libpng') option('vnc', type : 'feature', value : 'auto', diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c index 032fc5f..03cb2e7 100644 --- a/migration/block-dirty-bitmap.c +++ b/migration/block-dirty-bitmap.c @@ -1214,9 +1214,7 @@ static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque) DBMSaveState *s = &((DBMState *)opaque)->save; SaveBitmapState *dbms = NULL; - qemu_mutex_lock_iothread(); if (init_dirty_bitmap_migration(s) < 0) { - qemu_mutex_unlock_iothread(); return -1; } @@ -1224,7 +1222,6 @@ static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque) send_bitmap_start(f, s, dbms); } qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS); - qemu_mutex_unlock_iothread(); return 0; } diff --git a/migration/block.c b/migration/block.c index d115e1c..b60698d 100644 --- a/migration/block.c +++ b/migration/block.c @@ -731,18 +731,13 @@ static int block_save_setup(QEMUFile *f, void *opaque) trace_migration_block_save("setup", 
block_mig_state.submitted, block_mig_state.transferred); - qemu_mutex_lock_iothread(); ret = init_blk_migration(f); if (ret < 0) { - qemu_mutex_unlock_iothread(); return ret; } /* start track dirty blocks */ ret = set_dirty_tracking(); - - qemu_mutex_unlock_iothread(); - if (ret) { return ret; } diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 5b25ba2..d206700 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -321,6 +321,10 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %" PRIu64 " bytes/second\n", MigrationParameter_str(MIGRATION_PARAMETER_MAX_BANDWIDTH), params->max_bandwidth); + assert(params->has_avail_switchover_bandwidth); + monitor_printf(mon, "%s: %" PRIu64 " bytes/second\n", + MigrationParameter_str(MIGRATION_PARAMETER_AVAIL_SWITCHOVER_BANDWIDTH), + params->avail_switchover_bandwidth); assert(params->has_downtime_limit); monitor_printf(mon, "%s: %" PRIu64 " ms\n", MigrationParameter_str(MIGRATION_PARAMETER_DOWNTIME_LIMIT), @@ -574,6 +578,16 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) } p->max_bandwidth = valuebw; break; + case MIGRATION_PARAMETER_AVAIL_SWITCHOVER_BANDWIDTH: + p->has_avail_switchover_bandwidth = true; + ret = qemu_strtosz_MiB(valuestr, NULL, &valuebw); + if (ret < 0 || valuebw > INT64_MAX + || (size_t)valuebw != valuebw) { + error_setg(&err, "Invalid size %s", valuestr); + break; + } + p->avail_switchover_bandwidth = valuebw; + break; case MIGRATION_PARAMETER_DOWNTIME_LIMIT: p->has_downtime_limit = true; visit_type_size(v, param, &p->downtime_limit, &err); diff --git a/migration/migration-stats.c b/migration/migration-stats.c index 84e11e6..4cc989d 100644 --- a/migration/migration-stats.c +++ b/migration/migration-stats.c @@ -24,14 +24,15 @@ bool migration_rate_exceeded(QEMUFile *f) return true; } + uint64_t rate_limit_max = migration_rate_get(); + if (rate_limit_max == RATE_LIMIT_DISABLED) { + return false; + } + uint64_t rate_limit_start = stat64_get(&mig_stats.rate_limit_start); uint64_t rate_limit_current = migration_transferred_bytes(f); uint64_t rate_limit_used = rate_limit_current - rate_limit_start; - uint64_t rate_limit_max = stat64_get(&mig_stats.rate_limit_max); - if (rate_limit_max == RATE_LIMIT_DISABLED) { - return false; - } if (rate_limit_max > 0 && rate_limit_used > rate_limit_max) { return true; } diff --git a/migration/migration.c b/migration/migration.c index 1c6c81a..6ba5e14 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -99,7 +99,7 @@ static int migration_maybe_pause(MigrationState *s, int *current_active_state, int new_state); static void migrate_fd_cancel(MigrationState *s); -static int await_return_path_close_on_source(MigrationState *s); +static int close_return_path_on_source(MigrationState *s); static bool migration_needs_multiple_sockets(void) { @@ -1191,7 +1191,7 @@ static void migrate_fd_cleanup(MigrationState *s) * We already cleaned up to_dst_file, so errors from the return * path might be due to that, ignore them. 
*/ - await_return_path_close_on_source(s); + close_return_path_on_source(s); assert(!migration_is_active(s)); @@ -1442,6 +1442,7 @@ int migrate_init(MigrationState *s, Error **errp) error_free(s->error); s->error = NULL; s->hostname = NULL; + s->vmdesc = NULL; migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); @@ -1451,6 +1452,7 @@ int migrate_init(MigrationState *s, Error **errp) s->iteration_initial_bytes = 0; s->threshold_size = 0; s->switchover_acked = false; + s->rdma_migration = false; /* * set mig_stats compression_counters memory to zero for a * new migration @@ -2049,8 +2051,7 @@ static int open_return_path_on_source(MigrationState *ms) return 0; } -/* Returns 0 if the RP was ok, otherwise there was an error on the RP */ -static int await_return_path_close_on_source(MigrationState *ms) +static int close_return_path_on_source(MigrationState *ms) { int ret; @@ -2317,70 +2318,111 @@ static int migration_maybe_pause(MigrationState *s, return s->state == new_state ? 0 : -EINVAL; } -/** - * migration_completion: Used by migration_thread when there's not much left. - * The caller 'breaks' the loop when this returns. - * - * @s: Current migration state - */ -static void migration_completion(MigrationState *s) +static int migration_completion_precopy(MigrationState *s, + int *current_active_state) { int ret; - int current_active_state = s->state; - if (s->state == MIGRATION_STATUS_ACTIVE) { - qemu_mutex_lock_iothread(); - s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); + qemu_mutex_lock_iothread(); + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); - s->vm_old_state = runstate_get(); - global_state_store(); + s->vm_old_state = runstate_get(); + global_state_store(); - ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); - trace_migration_completion_vm_stop(ret); - if (ret >= 0) { - ret = migration_maybe_pause(s, ¤t_active_state, - MIGRATION_STATUS_DEVICE); - } - if (ret >= 0) { - /* - * Inactivate disks except in COLO, and track that we - * have done so in order to remember to reactivate - * them if migration fails or is cancelled. - */ - s->block_inactive = !migrate_colo(); - migration_rate_set(RATE_LIMIT_DISABLED); - ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, - s->block_inactive); - } + ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); + trace_migration_completion_vm_stop(ret); + if (ret < 0) { + goto out_unlock; + } - qemu_mutex_unlock_iothread(); + ret = migration_maybe_pause(s, current_active_state, + MIGRATION_STATUS_DEVICE); + if (ret < 0) { + goto out_unlock; + } - if (ret < 0) { - goto fail; - } - } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { - trace_migration_completion_postcopy_end(); + /* + * Inactivate disks except in COLO, and track that we have done so in order + * to remember to reactivate them if migration fails or is cancelled. 
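/*
 * Editor's sketch: the migration-hmp-cmds.c hunk earlier validates the
 * new avail-switchover-bandwidth value three ways before accepting it:
 * parse failure, larger than INT64_MAX, and truncation when narrowed
 * to size_t (which matters on 32-bit hosts). A standalone model, with
 * strtoull standing in for qemu_strtosz_MiB:
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

static bool parse_bandwidth(const char *str, uint64_t *out)
{
    char *end = NULL;

    errno = 0;
    unsigned long long v = strtoull(str, &end, 10);
    if (errno != 0 || end == str || *end != '\0') {
        return false;                      /* parse failure */
    }
    if (v > INT64_MAX || (size_t)v != v) {
        return false;                      /* out of range or truncates */
    }
    *out = v;
    return true;
}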
+ */ + s->block_inactive = !migrate_colo(); + migration_rate_set(RATE_LIMIT_DISABLED); + ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, + s->block_inactive); +out_unlock: + qemu_mutex_unlock_iothread(); + return ret; +} - qemu_mutex_lock_iothread(); - qemu_savevm_state_complete_postcopy(s->to_dst_file); - qemu_mutex_unlock_iothread(); +static void migration_completion_postcopy(MigrationState *s) +{ + trace_migration_completion_postcopy_end(); + + qemu_mutex_lock_iothread(); + qemu_savevm_state_complete_postcopy(s->to_dst_file); + qemu_mutex_unlock_iothread(); + + /* + * Shutdown the postcopy fast path thread. This is only needed when dest + * QEMU binary is old (7.1/7.2). QEMU 8.0+ doesn't need this. + */ + if (migrate_postcopy_preempt() && s->preempt_pre_7_2) { + postcopy_preempt_shutdown_file(s); + } + + trace_migration_completion_postcopy_end_after_complete(); +} +static void migration_completion_failed(MigrationState *s, + int current_active_state) +{ + if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE || + s->state == MIGRATION_STATUS_DEVICE)) { /* - * Shutdown the postcopy fast path thread. This is only needed - * when dest QEMU binary is old (7.1/7.2). QEMU 8.0+ doesn't need - * this. + * If not doing postcopy, vm_start() will be called: let's + * regain control on images. */ - if (migrate_postcopy_preempt() && s->preempt_pre_7_2) { - postcopy_preempt_shutdown_file(s); + Error *local_err = NULL; + + qemu_mutex_lock_iothread(); + bdrv_activate_all(&local_err); + if (local_err) { + error_report_err(local_err); + } else { + s->block_inactive = false; } + qemu_mutex_unlock_iothread(); + } - trace_migration_completion_postcopy_end_after_complete(); + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_FAILED); +} + +/** + * migration_completion: Used by migration_thread when there's not much left. + * The caller 'breaks' the loop when this returns. + * + * @s: Current migration state + */ +static void migration_completion(MigrationState *s) +{ + int ret = 0; + int current_active_state = s->state; + + if (s->state == MIGRATION_STATUS_ACTIVE) { + ret = migration_completion_precopy(s, ¤t_active_state); + } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { + migration_completion_postcopy(s); } else { + ret = -1; + } + + if (ret < 0) { goto fail; } - if (await_return_path_close_on_source(s)) { + if (close_return_path_on_source(s)) { goto fail; } @@ -2401,26 +2443,7 @@ static void migration_completion(MigrationState *s) return; fail: - if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE || - s->state == MIGRATION_STATUS_DEVICE)) { - /* - * If not doing postcopy, vm_start() will be called: let's - * regain control on images. - */ - Error *local_err = NULL; - - qemu_mutex_lock_iothread(); - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - } else { - s->block_inactive = false; - } - qemu_mutex_unlock_iothread(); - } - - migrate_set_state(&s->state, current_active_state, - MIGRATION_STATUS_FAILED); + migration_completion_failed(s, current_active_state); } /** @@ -2563,7 +2586,7 @@ static MigThrError postcopy_pause(MigrationState *s) * path and just wait for the thread to finish. It will be * re-created when we resume. 
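/*
 * Editor's sketch: after the refactoring above, migration_completion()
 * is a small dispatcher over the precopy/postcopy helpers, with every
 * failure funnelled through migration_completion_failed(). The
 * resulting shape, reduced to its control flow (helpers stubbed):
 */
enum mig_state { ST_ACTIVE, ST_POSTCOPY_ACTIVE, ST_OTHER };

static int complete_precopy(void)   { return 0; }  /* stop VM, save devices */
static void complete_postcopy(void) { }            /* final postcopy step */
static void completion_failed(void) { }            /* reactivate disks etc. */

static void completion(enum mig_state state)
{
    int ret = 0;

    if (state == ST_ACTIVE) {
        ret = complete_precopy();
    } else if (state == ST_POSTCOPY_ACTIVE) {
        complete_postcopy();
    } else {
        ret = -1;            /* unexpected state is now an explicit error */
    }
    if (ret < 0) {
        completion_failed();
    }
}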
*/ - await_return_path_close_on_source(s); + close_return_path_on_source(s); migrate_set_state(&s->state, s->state, MIGRATION_STATUS_POSTCOPY_PAUSED); @@ -2689,17 +2712,33 @@ static void migration_update_counters(MigrationState *s, { uint64_t transferred, transferred_pages, time_spent; uint64_t current_bytes; /* bytes transferred since the beginning */ + uint64_t switchover_bw; + /* Expected bandwidth when switching over to destination QEMU */ + double expected_bw_per_ms; double bandwidth; if (current_time < s->iteration_start_time + BUFFER_DELAY) { return; } + switchover_bw = migrate_avail_switchover_bandwidth(); current_bytes = migration_transferred_bytes(s->to_dst_file); transferred = current_bytes - s->iteration_initial_bytes; time_spent = current_time - s->iteration_start_time; bandwidth = (double)transferred / time_spent; - s->threshold_size = bandwidth * migrate_downtime_limit(); + + if (switchover_bw) { + /* + * If the user specified a switchover bandwidth, let's trust the + * user so that can be more accurate than what we estimated. + */ + expected_bw_per_ms = switchover_bw / 1000; + } else { + /* If the user doesn't specify bandwidth, we use the estimated */ + expected_bw_per_ms = bandwidth; + } + + s->threshold_size = expected_bw_per_ms * migrate_downtime_limit(); s->mbps = (((double) transferred * 8.0) / ((double) time_spent / 1000.0)) / 1000.0 / 1000.0; @@ -2716,7 +2755,7 @@ static void migration_update_counters(MigrationState *s, if (stat64_get(&mig_stats.dirty_pages_rate) && transferred > 10000) { s->expected_downtime = - stat64_get(&mig_stats.dirty_bytes_last_sync) / bandwidth; + stat64_get(&mig_stats.dirty_bytes_last_sync) / expected_bw_per_ms; } migration_rate_reset(s->to_dst_file); @@ -2724,7 +2763,9 @@ static void migration_update_counters(MigrationState *s, update_iteration_initial_status(s); trace_migrate_transferred(transferred, time_spent, - bandwidth, s->threshold_size); + /* Both in unit bytes/ms */ + bandwidth, switchover_bw / 1000, + s->threshold_size); } static bool migration_can_switchover(MigrationState *s) @@ -2980,7 +3021,9 @@ static void *migration_thread(void *opaque) object_ref(OBJECT(s)); update_iteration_initial_status(s); + qemu_mutex_lock_iothread(); qemu_savevm_state_header(s->to_dst_file); + qemu_mutex_unlock_iothread(); /* * If we opened the return path, we need to make sure dst has it @@ -3008,7 +3051,9 @@ static void *migration_thread(void *opaque) qemu_savevm_send_colo_enable(s->to_dst_file); } + qemu_mutex_lock_iothread(); qemu_savevm_state_setup(s->to_dst_file); + qemu_mutex_unlock_iothread(); qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE); @@ -3119,8 +3164,10 @@ static void *bg_migration_thread(void *opaque) ram_write_tracking_prepare(); #endif + qemu_mutex_lock_iothread(); qemu_savevm_state_header(s->to_dst_file); qemu_savevm_state_setup(s->to_dst_file); + qemu_mutex_unlock_iothread(); qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE); diff --git a/migration/migration.h b/migration/migration.h index cd55343..ae82004 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -294,7 +294,7 @@ struct MigrationState { /* * The final stage happens when the remaining data is smaller than * this threshold; it's calculated from the requested downtime and - * measured bandwidth + * measured bandwidth, or avail-switchover-bandwidth if specified. */ int64_t threshold_size; @@ -469,6 +469,8 @@ struct MigrationState { * switchover has been received. 
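/*
 * Editor's sketch: the migration_update_counters() hunk above chooses
 * the bandwidth used for the switchover decision -- the user-supplied
 * avail-switchover-bandwidth when set, the measured transfer rate
 * otherwise -- and works in bytes/ms before multiplying by the
 * downtime limit. The computation in isolation (names illustrative):
 */
#include <stdint.h>

static uint64_t switchover_threshold(uint64_t transferred,    /* bytes */
                                     uint64_t time_spent,     /* ms */
                                     uint64_t switchover_bw,  /* bytes/s, 0 = unset */
                                     uint64_t downtime_limit) /* ms */
{
    double bw_per_ms;

    if (switchover_bw) {
        bw_per_ms = switchover_bw / 1000.0;           /* trust the user */
    } else {
        bw_per_ms = (double)transferred / time_spent; /* measured rate */
    }
    return (uint64_t)(bw_per_ms * downtime_limit);
}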
*/ bool switchover_acked; + /* Is this a rdma migration */ + bool rdma_migration; }; void migrate_set_state(int *state, int old_state, int new_state); diff --git a/migration/multifd.c b/migration/multifd.c index 0f6b203..1fe53d3 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -510,6 +510,11 @@ static void multifd_send_terminate_threads(Error *err) } } +static int multifd_send_channel_destroy(QIOChannel *send) +{ + return socket_send_channel_destroy(send); +} + void multifd_save_cleanup(void) { int i; @@ -532,7 +537,7 @@ void multifd_save_cleanup(void) if (p->registered_yank) { migration_ioc_unregister_yank(p->c); } - socket_send_channel_destroy(p->c); + multifd_send_channel_destroy(p->c); p->c = NULL; qemu_mutex_destroy(&p->mutex); qemu_sem_destroy(&p->sem); @@ -714,8 +719,6 @@ static void *multifd_send_thread(void *opaque) if (ret != 0) { break; } - stat64_add(&mig_stats.multifd_bytes, p->packet_len); - stat64_add(&mig_stats.transferred, p->packet_len); } else { /* Send header using the same writev call */ p->iov[0].iov_len = p->packet_len; @@ -728,8 +731,11 @@ static void *multifd_send_thread(void *opaque) break; } - stat64_add(&mig_stats.multifd_bytes, p->next_packet_size); - stat64_add(&mig_stats.transferred, p->next_packet_size); + stat64_add(&mig_stats.multifd_bytes, + p->next_packet_size + p->packet_len); + stat64_add(&mig_stats.transferred, + p->next_packet_size + p->packet_len); + p->next_packet_size = 0; qemu_mutex_lock(&p->mutex); p->pending_job--; qemu_mutex_unlock(&p->mutex); @@ -747,19 +753,13 @@ static void *multifd_send_thread(void *opaque) } out: - if (local_err) { + if (ret) { + assert(local_err); trace_multifd_send_error(p->id); multifd_send_terminate_threads(local_err); - error_free(local_err); - } - - /* - * Error happen, I will exit, but I can't just leave, tell - * who pay attention to me. - */ - if (ret != 0) { qemu_sem_post(&p->sem_sync); qemu_sem_post(&multifd_send_state->channels_ready); + error_free(local_err); } qemu_mutex_lock(&p->mutex); @@ -775,7 +775,7 @@ out: static bool multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc, - Error *error); + Error **errp); static void multifd_tls_outgoing_handshake(QIOTask *task, gpointer opaque) @@ -784,21 +784,22 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); Error *err = NULL; - if (qio_task_propagate_error(task, &err)) { - trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - } else { + if (!qio_task_propagate_error(task, &err)) { trace_multifd_tls_outgoing_handshake_complete(ioc); + if (multifd_channel_connect(p, ioc, &err)) { + return; + } } - if (!multifd_channel_connect(p, ioc, err)) { - /* - * Error happen, mark multifd_send_thread status as 'quit' although it - * is not created, and then tell who pay attention to me. - */ - p->quit = true; - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); - } + trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); + + /* + * Error happen, mark multifd_send_thread status as 'quit' although it + * is not created, and then tell who pay attention to me. 
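/*
 * Editor's sketch: the multifd_send_thread() hunk above now sends the
 * packet header and payload with a single writev and bumps the
 * counters once by packet_len + next_packet_size, instead of
 * accounting header and payload separately. The accounting step alone:
 */
#include <stddef.h>
#include <stdint.h>

struct mig_counters {
    uint64_t multifd_bytes;
    uint64_t transferred;
};

static void account_packet(struct mig_counters *c, size_t packet_len,
                           size_t next_packet_size)
{
    /* one update covering header + payload, after a successful write */
    c->multifd_bytes += packet_len + next_packet_size;
    c->transferred   += packet_len + next_packet_size;
}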
+ */ + p->quit = true; + qemu_sem_post(&multifd_send_state->channels_ready); + qemu_sem_post(&p->sem_sync); } static void *multifd_tls_handshake_thread(void *opaque) @@ -814,7 +815,7 @@ static void *multifd_tls_handshake_thread(void *opaque) return NULL; } -static void multifd_tls_channel_connect(MultiFDSendParams *p, +static bool multifd_tls_channel_connect(MultiFDSendParams *p, QIOChannel *ioc, Error **errp) { @@ -824,7 +825,7 @@ static void multifd_tls_channel_connect(MultiFDSendParams *p, tioc = migration_tls_client_create(ioc, hostname, errp); if (!tioc) { - return; + return false; } object_unref(OBJECT(ioc)); @@ -834,31 +835,25 @@ static void multifd_tls_channel_connect(MultiFDSendParams *p, qemu_thread_create(&p->thread, "multifd-tls-handshake-worker", multifd_tls_handshake_thread, p, QEMU_THREAD_JOINABLE); + return true; } static bool multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc, - Error *error) + Error **errp) { trace_multifd_set_outgoing_channel( ioc, object_get_typename(OBJECT(ioc)), - migrate_get_current()->hostname, error); + migrate_get_current()->hostname); - if (error) { - return false; - } if (migrate_channel_requires_tls_upgrade(ioc)) { - multifd_tls_channel_connect(p, ioc, &error); - if (!error) { - /* - * tls_channel_connect will call back to this - * function after the TLS handshake, - * so we mustn't call multifd_send_thread until then - */ - return true; - } else { - return false; - } + /* + * tls_channel_connect will call back to this + * function after the TLS handshake, + * so we mustn't call multifd_send_thread until then + */ + return multifd_tls_channel_connect(p, ioc, errp); + } else { migration_ioc_register_yank(ioc); p->registered_yank = true; @@ -889,20 +884,26 @@ static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; - QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task)); + QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); Error *local_err = NULL; trace_multifd_new_send_channel_async(p->id); if (!qio_task_propagate_error(task, &local_err)) { - p->c = sioc; + p->c = ioc; qio_channel_set_delay(p->c, false); p->running = true; - if (multifd_channel_connect(p, sioc, local_err)) { + if (multifd_channel_connect(p, ioc, &local_err)) { return; } } - multifd_new_send_channel_cleanup(p, sioc, local_err); + trace_multifd_new_send_channel_async_error(p->id, local_err); + multifd_new_send_channel_cleanup(p, ioc, local_err); +} + +static void multifd_new_send_channel_create(gpointer opaque) +{ + socket_send_channel_create(multifd_new_send_channel_async, opaque); } int multifd_save_setup(Error **errp) @@ -951,7 +952,7 @@ int multifd_save_setup(Error **errp) p->write_flags = 0; } - socket_send_channel_create(multifd_new_send_channel_async, p); + multifd_new_send_channel_create(p); } for (i = 0; i < thread_count; i++) { diff --git a/migration/options.c b/migration/options.c index 6bbfd48..42fb818 100644 --- a/migration/options.c +++ b/migration/options.c @@ -125,6 +125,8 @@ Property migration_properties[] = { parameters.cpu_throttle_tailslow, false), DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState, parameters.max_bandwidth, MAX_THROTTLE), + DEFINE_PROP_SIZE("avail-switchover-bandwidth", MigrationState, + parameters.avail_switchover_bandwidth, 0), DEFINE_PROP_UINT64("x-downtime-limit", MigrationState, parameters.downtime_limit, DEFAULT_MIGRATE_SET_DOWNTIME), @@ -376,6 +378,13 @@ bool migrate_postcopy(void) return 
migrate_postcopy_ram() || migrate_dirty_bitmaps(); } +bool migrate_rdma(void) +{ + MigrationState *s = migrate_get_current(); + + return s->rdma_migration; +} + bool migrate_tls(void) { MigrationState *s = migrate_get_current(); @@ -780,6 +789,13 @@ uint64_t migrate_max_bandwidth(void) return s->parameters.max_bandwidth; } +uint64_t migrate_avail_switchover_bandwidth(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.avail_switchover_bandwidth; +} + uint64_t migrate_max_postcopy_bandwidth(void) { MigrationState *s = migrate_get_current(); @@ -917,6 +933,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) s->parameters.tls_authz : ""); params->has_max_bandwidth = true; params->max_bandwidth = s->parameters.max_bandwidth; + params->has_avail_switchover_bandwidth = true; + params->avail_switchover_bandwidth = s->parameters.avail_switchover_bandwidth; params->has_downtime_limit = true; params->downtime_limit = s->parameters.downtime_limit; params->has_x_checkpoint_delay = true; @@ -1056,6 +1074,15 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) return false; } + if (params->has_avail_switchover_bandwidth && + (params->avail_switchover_bandwidth > SIZE_MAX)) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "avail_switchover_bandwidth", + "an integer in the range of 0 to "stringify(SIZE_MAX) + " bytes/second"); + return false; + } + if (params->has_downtime_limit && (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, @@ -1225,6 +1252,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->max_bandwidth = params->max_bandwidth; } + if (params->has_avail_switchover_bandwidth) { + dest->avail_switchover_bandwidth = params->avail_switchover_bandwidth; + } + if (params->has_downtime_limit) { dest->downtime_limit = params->downtime_limit; } @@ -1341,6 +1372,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) } } + if (params->has_avail_switchover_bandwidth) { + s->parameters.avail_switchover_bandwidth = params->avail_switchover_bandwidth; + } + if (params->has_downtime_limit) { s->parameters.downtime_limit = params->downtime_limit; } diff --git a/migration/options.h b/migration/options.h index 045e2a4..237f2d6 100644 --- a/migration/options.h +++ b/migration/options.h @@ -56,6 +56,7 @@ bool migrate_zero_copy_send(void); bool migrate_multifd_flush_after_each_section(void); bool migrate_postcopy(void); +bool migrate_rdma(void); bool migrate_tls(void); /* capabilities helpers */ @@ -80,6 +81,7 @@ int migrate_decompress_threads(void); uint64_t migrate_downtime_limit(void); uint8_t migrate_max_cpu_throttle(void); uint64_t migrate_max_bandwidth(void); +uint64_t migrate_avail_switchover_bandwidth(void); uint64_t migrate_max_postcopy_bandwidth(void); int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 7fb6592..3fb2514 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -32,12 +32,12 @@ #include "trace.h" #include "options.h" #include "qapi/error.h" +#include "rdma.h" #define IO_BUF_SIZE 32768 #define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64) struct QEMUFile { - const QEMUFileHooks *hooks; QIOChannel *ioc; bool is_writable; @@ -132,11 +132,6 @@ QEMUFile *qemu_file_new_input(QIOChannel *ioc) return qemu_file_new_impl(ioc, false); } -void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks) -{ - f->hooks = hooks; -} - /* * 
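/*
 * Editor's sketch: migrate_params_check() above caps
 * avail-switchover-bandwidth at SIZE_MAX, splicing the limit into the
 * error text with QEMU's stringify() macro. The same two-level
 * stringification trick, standalone:
 */
#include <stdint.h>
#include <stdio.h>

#define xstringify(s) #s
#define stringify(s) xstringify(s)

static int check_bandwidth_param(uint64_t value)
{
    /* always false on 64-bit hosts; the check matters on 32-bit ones */
    if (value > SIZE_MAX) {
        fprintf(stderr, "expected 0.." stringify(SIZE_MAX) " bytes/second\n");
        return -1;
    }
    return 0;
}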
Get last error for stream f with optional Error* * @@ -297,60 +292,6 @@ void qemu_fflush(QEMUFile *f) f->iovcnt = 0; } -void ram_control_before_iterate(QEMUFile *f, uint64_t flags) -{ - int ret = 0; - - if (f->hooks && f->hooks->before_ram_iterate) { - ret = f->hooks->before_ram_iterate(f, flags, NULL); - if (ret < 0) { - qemu_file_set_error(f, ret); - } - } -} - -void ram_control_after_iterate(QEMUFile *f, uint64_t flags) -{ - int ret = 0; - - if (f->hooks && f->hooks->after_ram_iterate) { - ret = f->hooks->after_ram_iterate(f, flags, NULL); - if (ret < 0) { - qemu_file_set_error(f, ret); - } - } -} - -void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data) -{ - if (f->hooks && f->hooks->hook_ram_load) { - int ret = f->hooks->hook_ram_load(f, flags, data); - if (ret < 0) { - qemu_file_set_error(f, ret); - } - } -} - -int ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, - ram_addr_t offset, size_t size) -{ - if (f->hooks && f->hooks->save_page) { - int ret = f->hooks->save_page(f, block_offset, offset, size); - /* - * RAM_SAVE_CONTROL_* are negative values - */ - if (ret != RAM_SAVE_CONTROL_DELAYED && - ret != RAM_SAVE_CONTROL_NOT_SUPP) { - if (ret < 0) { - qemu_file_set_error(f, ret); - } - } - return ret; - } - - return RAM_SAVE_CONTROL_NOT_SUPP; -} - /* * Attempt to fill the buffer from the underlying file * Returns the number of bytes read, or negative value for an error. diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 03e718c..a29c37b 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -29,41 +29,8 @@ #include "exec/cpu-common.h" #include "io/channel.h" -/* - * This function provides hooks around different - * stages of RAM migration. - * 'data' is call specific data associated with the 'flags' value - */ -typedef int (QEMURamHookFunc)(QEMUFile *f, uint64_t flags, void *data); - -/* - * Constants used by ram_control_* hooks - */ -#define RAM_CONTROL_SETUP 0 -#define RAM_CONTROL_ROUND 1 -#define RAM_CONTROL_HOOK 2 -#define RAM_CONTROL_FINISH 3 -#define RAM_CONTROL_BLOCK_REG 4 - -/* - * This function allows override of where the RAM page - * is saved (such as RDMA, for example.) - */ -typedef int (QEMURamSaveFunc)(QEMUFile *f, - ram_addr_t block_offset, - ram_addr_t offset, - size_t size); - -typedef struct QEMUFileHooks { - QEMURamHookFunc *before_ram_iterate; - QEMURamHookFunc *after_ram_iterate; - QEMURamHookFunc *hook_ram_load; - QEMURamSaveFunc *save_page; -} QEMUFileHooks; - QEMUFile *qemu_file_new_input(QIOChannel *ioc); QEMUFile *qemu_file_new_output(QIOChannel *ioc); -void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks); int qemu_fclose(QEMUFile *f); /* @@ -127,22 +94,6 @@ void qemu_fflush(QEMUFile *f); void qemu_file_set_blocking(QEMUFile *f, bool block); int qemu_file_get_to_fd(QEMUFile *f, int fd, size_t size); -void ram_control_before_iterate(QEMUFile *f, uint64_t flags); -void ram_control_after_iterate(QEMUFile *f, uint64_t flags); -void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data); - -/* Whenever this is found in the data stream, the flags - * will be passed to ram_control_load_hook in the incoming-migration - * side. This lets before_ram_iterate/after_ram_iterate add - * transport-specific sections to the RAM migration data. 
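/*
 * Editor's sketch: the ram_control_* wrappers deleted above dispatched
 * through per-file function pointers (QEMUFileHooks), which only the
 * RDMA code ever populated. The series replaces that indirection with
 * direct calls into rdma.c that guard themselves. Before/after shape,
 * much simplified:
 */
typedef int (*hook_fn)(void);

/* before: indirection through a hooks table */
struct file_hooks { hook_fn before_iterate; };
static int call_hook(const struct file_hooks *h)
{
    return h && h->before_iterate ? h->before_iterate() : 0;
}

/* after: one direct, self-guarding entry point */
static int rdma_enabled;   /* stands in for migrate_rdma() */
static int rdma_before_iterate(void)
{
    if (!rdma_enabled) {
        return 0;          /* no-op unless this is an RDMA migration */
    }
    /* ... RDMA registration work ... */
    return 0;
}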
- */ -#define RAM_SAVE_FLAG_HOOK 0x80 - -#define RAM_SAVE_CONTROL_NOT_SUPP -1000 -#define RAM_SAVE_CONTROL_DELAYED -2000 - -int ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, - ram_addr_t offset, size_t size); QIOChannel *qemu_file_get_ioc(QEMUFile *file); #endif diff --git a/migration/ram.c b/migration/ram.c index 2f5ce4d..c844151 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -59,6 +59,7 @@ #include "qemu/iov.h" #include "multifd.h" #include "sysemu/runstate.h" +#include "rdma.h" #include "options.h" #include "sysemu/dirtylimit.h" #include "sysemu/kvm.h" @@ -88,7 +89,7 @@ #define RAM_SAVE_FLAG_EOS 0x10 #define RAM_SAVE_FLAG_CONTINUE 0x20 #define RAM_SAVE_FLAG_XBZRLE 0x40 -/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */ +/* 0x80 is reserved in rdma.h for RAM_SAVE_FLAG_HOOK */ #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 /* We can't use any flag that is bigger than 0x200 */ @@ -569,7 +570,6 @@ void mig_throttle_counter_reset(void) /** * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache * - * @rs: current RAM state * @current_addr: address for the zero page * * Update the xbzrle cache to reflect a page that's been sent as all 0. @@ -578,7 +578,7 @@ void mig_throttle_counter_reset(void) * As a bonus, if the page wasn't in the cache it gets added so that * when a small write is made into the 0'd page it gets XBZRLE sent. */ -static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) +static void xbzrle_cache_zero_page(ram_addr_t current_addr) { /* We don't care if this fails to allocate a new cache page * as long as it updated an old one */ @@ -1138,50 +1138,45 @@ void ram_release_page(const char *rbname, uint64_t offset) } /** - * save_zero_page_to_file: send the zero page to the file + * save_zero_page: send the zero page to the stream * - * Returns the size of data written to the file, 0 means the page is not - * a zero page + * Returns the number of pages written. * + * @rs: current RAM state * @pss: current PSS channel * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file, - RAMBlock *block, ram_addr_t offset) +static int save_zero_page(RAMState *rs, PageSearchStatus *pss, RAMBlock *block, + ram_addr_t offset) { uint8_t *p = block->host + offset; + QEMUFile *file = pss->pss_channel; int len = 0; - if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { - len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); - qemu_put_byte(file, 0); - len += 1; - ram_release_page(block->idstr, offset); + if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { + return 0; } - return len; -} -/** - * save_zero_page: send the zero page to the stream - * - * Returns the number of pages written. 
- * - * @pss: current PSS channel - * @block: block that contains the page we want to send - * @offset: offset inside the block for the page - */ -static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block, - ram_addr_t offset) -{ - int len = save_zero_page_to_file(pss, f, block, offset); + len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); + qemu_put_byte(file, 0); + len += 1; + ram_release_page(block->idstr, offset); - if (len) { - stat64_add(&mig_stats.zero_pages, 1); - ram_transferred_add(len); - return 1; + stat64_add(&mig_stats.zero_pages, 1); + ram_transferred_add(len); + + /* + * Must let xbzrle know, otherwise a previous (now 0'd) cached + * page would be stale. + */ + if (rs->xbzrle_started) { + XBZRLE_cache_lock(); + xbzrle_cache_zero_page(block->offset + offset); + XBZRLE_cache_unlock(); } - return -1; + + return len; } /* @@ -1196,8 +1191,8 @@ static bool control_save_page(PageSearchStatus *pss, RAMBlock *block, { int ret; - ret = ram_control_save_page(pss->pss_channel, block->offset, offset, - TARGET_PAGE_SIZE); + ret = rdma_control_save_page(pss->pss_channel, block->offset, offset, + TARGET_PAGE_SIZE); if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { return false; } @@ -1395,7 +1390,8 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) pss->page = 0; pss->block = QLIST_NEXT_RCU(pss->block, next); if (!pss->block) { - if (!migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && + !migrate_multifd_flush_after_each_section()) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; int ret = multifd_send_sync_main(f); if (ret < 0) { @@ -2137,17 +2133,8 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return 1; } - res = save_zero_page(pss, pss->pss_channel, block, offset); - if (res > 0) { - /* Must let xbzrle know, otherwise a previous (now 0'd) cached - * page would be stale - */ - if (rs->xbzrle_started) { - XBZRLE_cache_lock(); - xbzrle_cache_zero_page(rs, block->offset + offset); - XBZRLE_cache_unlock(); - } - return res; + if (save_zero_page(rs, pss, block, offset)) { + return 1; } /* @@ -2891,8 +2878,6 @@ static void migration_bitmap_clear_discarded_pages(RAMState *rs) static void ram_init_bitmaps(RAMState *rs) { - /* For memory_global_dirty_log_start below. 
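/*
 * Editor's sketch: the rewritten save_zero_page() above folds the old
 * save_zero_page_to_file() helper in: detect the zero page, emit the
 * page header plus one zero byte, release the page, and invalidate any
 * stale XBZRLE cache entry on the spot. A simplified model with the
 * stream I/O stubbed out:
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

static bool is_zero(const uint8_t *p, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        if (p[i]) {
            return false;
        }
    }
    return true;
}

static int save_zero_page_model(const uint8_t *page, bool xbzrle_started)
{
    if (!is_zero(page, PAGE_SIZE)) {
        return 0;               /* not a zero page; caller sends the data */
    }
    /* emit header + a single zero byte, release the page (omitted) */
    if (xbzrle_started) {
        /* drop the cached copy so a later small write is diffed
         * against a zero page, as the patch comment explains */
    }
    return 1;                   /* nonzero: page fully handled */
}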
*/ - qemu_mutex_lock_iothread(); qemu_mutex_lock_ramlist(); WITH_RCU_READ_LOCK_GUARD() { @@ -2904,7 +2889,6 @@ static void ram_init_bitmaps(RAMState *rs) } } qemu_mutex_unlock_ramlist(); - qemu_mutex_unlock_iothread(); /* * After an eventual first bitmap sync, fixup the initial bitmap @@ -3062,17 +3046,27 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } } - ram_control_before_iterate(f, RAM_CONTROL_SETUP); - ram_control_after_iterate(f, RAM_CONTROL_SETUP); + ret = rdma_registration_start(f, RAM_CONTROL_SETUP); + if (ret < 0) { + qemu_file_set_error(f, ret); + } + + ret = rdma_registration_stop(f, RAM_CONTROL_SETUP); + if (ret < 0) { + qemu_file_set_error(f, ret); + } migration_ops = g_malloc0(sizeof(MigrationOps)); migration_ops->ram_save_target_page = ram_save_target_page_legacy; + + qemu_mutex_unlock_iothread(); ret = multifd_send_sync_main(f); + qemu_mutex_lock_iothread(); if (ret < 0) { return ret; } - if (!migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); } @@ -3122,7 +3116,10 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) /* Read version before ram_list.blocks */ smp_rmb(); - ram_control_before_iterate(f, RAM_CONTROL_ROUND); + ret = rdma_registration_start(f, RAM_CONTROL_ROUND); + if (ret < 0) { + qemu_file_set_error(f, ret); + } t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); i = 0; @@ -3179,12 +3176,15 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) * Must occur before EOS (or any QEMUFile operation) * because of RDMA protocol. */ - ram_control_after_iterate(f, RAM_CONTROL_ROUND); + ret = rdma_registration_stop(f, RAM_CONTROL_ROUND); + if (ret < 0) { + qemu_file_set_error(f, ret); + } out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { - if (migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && migrate_multifd_flush_after_each_section()) { ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); if (ret < 0) { return ret; @@ -3227,7 +3227,10 @@ static int ram_save_complete(QEMUFile *f, void *opaque) migration_bitmap_sync_precopy(rs, true); } - ram_control_before_iterate(f, RAM_CONTROL_FINISH); + ret = rdma_registration_start(f, RAM_CONTROL_FINISH); + if (ret < 0) { + qemu_file_set_error(f, ret); + } /* try transferring iterative blocks of memory */ @@ -3249,7 +3252,11 @@ static int ram_save_complete(QEMUFile *f, void *opaque) qemu_mutex_unlock(&rs->bitmap_mutex); ram_flush_compressed_data(rs); - ram_control_after_iterate(f, RAM_CONTROL_FINISH); + + int ret = rdma_registration_stop(f, RAM_CONTROL_FINISH); + if (ret < 0) { + qemu_file_set_error(f, ret); + } } if (ret < 0) { @@ -3261,7 +3268,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) return ret; } - if (!migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); } qemu_put_be64(f, RAM_SAVE_FLAG_EOS); @@ -3768,7 +3775,8 @@ int ram_load_postcopy(QEMUFile *f, int channel) break; case RAM_SAVE_FLAG_EOS: /* normal exit */ - if (migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && + migrate_multifd_flush_after_each_section()) { multifd_recv_sync_main(); } break; @@ -3861,6 +3869,85 @@ void colo_flush_ram_cache(void) trace_colo_flush_ram_cache_end(); } +static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length) +{ + int ret = 0; + /* ADVISE is earlier, it shows the source has 
the postcopy capability on */ + bool postcopy_advised = migration_incoming_postcopy_advised(); + + assert(block); + + if (!qemu_ram_is_migratable(block)) { + error_report("block %s should not be migrated !", block->idstr); + return -EINVAL; + } + + if (length != block->used_length) { + Error *local_err = NULL; + + ret = qemu_ram_resize(block, length, &local_err); + if (local_err) { + error_report_err(local_err); + } + } + /* For postcopy we need to check hugepage sizes match */ + if (postcopy_advised && migrate_postcopy_ram() && + block->page_size != qemu_host_page_size) { + uint64_t remote_page_size = qemu_get_be64(f); + if (remote_page_size != block->page_size) { + error_report("Mismatched RAM page size %s " + "(local) %zd != %" PRId64, block->idstr, + block->page_size, remote_page_size); + ret = -EINVAL; + } + } + if (migrate_ignore_shared()) { + hwaddr addr = qemu_get_be64(f); + if (migrate_ram_is_ignored(block) && + block->mr->addr != addr) { + error_report("Mismatched GPAs for block %s " + "%" PRId64 "!= %" PRId64, block->idstr, + (uint64_t)addr, (uint64_t)block->mr->addr); + ret = -EINVAL; + } + } + ret = rdma_block_notification_handle(f, block->idstr); + if (ret < 0) { + qemu_file_set_error(f, ret); + } + + return ret; +} + +static int parse_ramblocks(QEMUFile *f, ram_addr_t total_ram_bytes) +{ + int ret = 0; + + /* Synchronize RAM block list */ + while (!ret && total_ram_bytes) { + RAMBlock *block; + char id[256]; + ram_addr_t length; + int len = qemu_get_byte(f); + + qemu_get_buffer(f, (uint8_t *)id, len); + id[len] = 0; + length = qemu_get_be64(f); + + block = qemu_ram_block_by_name(id); + if (block) { + ret = parse_ramblock(f, block, length); + } else { + error_report("Unknown ramblock \"%s\", cannot accept " + "migration", id); + ret = -EINVAL; + } + total_ram_bytes -= length; + } + + return ret; +} + /** * ram_load_precopy: load pages in precopy case * @@ -3875,14 +3962,13 @@ static int ram_load_precopy(QEMUFile *f) { MigrationIncomingState *mis = migration_incoming_get_current(); int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; - /* ADVISE is earlier, it shows the source has the postcopy capability on */ - bool postcopy_advised = migration_incoming_postcopy_advised(); + if (!migrate_compress()) { invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; } while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { - ram_addr_t addr, total_ram_bytes; + ram_addr_t addr; void *host = NULL, *host_bak = NULL; uint8_t ch; @@ -3953,65 +4039,7 @@ static int ram_load_precopy(QEMUFile *f) switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { case RAM_SAVE_FLAG_MEM_SIZE: - /* Synchronize RAM block list */ - total_ram_bytes = addr; - while (!ret && total_ram_bytes) { - RAMBlock *block; - char id[256]; - ram_addr_t length; - - len = qemu_get_byte(f); - qemu_get_buffer(f, (uint8_t *)id, len); - id[len] = 0; - length = qemu_get_be64(f); - - block = qemu_ram_block_by_name(id); - if (block && !qemu_ram_is_migratable(block)) { - error_report("block %s should not be migrated !", id); - ret = -EINVAL; - } else if (block) { - if (length != block->used_length) { - Error *local_err = NULL; - - ret = qemu_ram_resize(block, length, - &local_err); - if (local_err) { - error_report_err(local_err); - } - } - /* For postcopy we need to check hugepage sizes match */ - if (postcopy_advised && migrate_postcopy_ram() && - block->page_size != qemu_host_page_size) { - uint64_t remote_page_size = qemu_get_be64(f); - if (remote_page_size != block->page_size) { - error_report("Mismatched RAM page size %s " - "(local) %zd != %" PRId64, - 
id, block->page_size, - remote_page_size); - ret = -EINVAL; - } - } - if (migrate_ignore_shared()) { - hwaddr addr2 = qemu_get_be64(f); - if (migrate_ram_is_ignored(block) && - block->mr->addr != addr2) { - error_report("Mismatched GPAs for block %s " - "%" PRId64 "!= %" PRId64, - id, (uint64_t)addr2, - (uint64_t)block->mr->addr); - ret = -EINVAL; - } - } - ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, - block->idstr); - } else { - error_report("Unknown ramblock \"%s\", cannot " - "accept migration", id); - ret = -EINVAL; - } - - total_ram_bytes -= length; - } + ret = parse_ramblocks(f, addr); break; case RAM_SAVE_FLAG_ZERO: @@ -4046,12 +4074,16 @@ static int ram_load_precopy(QEMUFile *f) break; case RAM_SAVE_FLAG_EOS: /* normal exit */ - if (migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && + migrate_multifd_flush_after_each_section()) { multifd_recv_sync_main(); } break; case RAM_SAVE_FLAG_HOOK: - ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); + ret = rdma_registration_handle(f); + if (ret < 0) { + qemu_file_set_error(f, ret); + } break; default: error_report("Unknown combination of migration flags: 0x%x", flags); @@ -4159,7 +4191,8 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) int ret = -EINVAL; /* from_dst_file is always valid because we're within rp_thread */ QEMUFile *file = s->rp_state.from_dst_file; - unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; + g_autofree unsigned long *le_bitmap = NULL; + unsigned long nbits = block->used_length >> TARGET_PAGE_BITS; uint64_t local_size = DIV_ROUND_UP(nbits, 8); uint64_t size, end_mark; RAMState *rs = ram_state; @@ -4188,8 +4221,7 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) error_report("%s: ramblock '%s' bitmap size mismatch " "(0x%"PRIx64" != 0x%"PRIx64")", __func__, block->idstr, size, local_size); - ret = -EINVAL; - goto out; + return -EINVAL; } size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); @@ -4200,15 +4232,13 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) error_report("%s: read bitmap failed for ramblock '%s': %d" " (size 0x%"PRIx64", got: 0x%"PRIx64")", __func__, block->idstr, ret, local_size, size); - ret = -EIO; - goto out; + return -EIO; } if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, __func__, block->idstr, end_mark); - ret = -EINVAL; - goto out; + return -EINVAL; } /* @@ -4240,10 +4270,7 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) */ migration_rp_kick(s); - ret = 0; -out: - g_free(le_bitmap); - return ret; + return 0; } static int ram_resume_prepare(MigrationState *s, void *opaque) @@ -4290,6 +4317,11 @@ static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); Error *err = NULL; + if (!rb) { + error_report("RAM block not found"); + return; + } + if (migrate_ram_is_ignored(rb)) { return; } diff --git a/migration/rdma.c b/migration/rdma.c index f6fc226..2a1852e 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -559,10 +559,8 @@ static void rdma_add_block(RDMAContext *rdma, const char *block_name, local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1); if (local->nb_blocks) { - int x; - if (rdma->blockmap) { - for (x = 0; x < local->nb_blocks; x++) { + for (int x = 0; x < local->nb_blocks; x++) { g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)old[x].offset); g_hash_table_insert(rdma->blockmap, @@ -649,15 +647,12 @@ 
static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) { RDMALocalBlocks *local = &rdma->local_ram_blocks; RDMALocalBlock *old = local->block; - int x; if (rdma->blockmap) { g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset); } if (block->pmr) { - int j; - - for (j = 0; j < block->nb_chunks; j++) { + for (int j = 0; j < block->nb_chunks; j++) { if (!block->pmr[j]) { continue; } @@ -687,7 +682,7 @@ static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) block->block_name = NULL; if (rdma->blockmap) { - for (x = 0; x < local->nb_blocks; x++) { + for (int x = 0; x < local->nb_blocks; x++) { g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)old[x].offset); } @@ -705,7 +700,7 @@ static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) memcpy(local->block + block->index, old + (block->index + 1), sizeof(RDMALocalBlock) * (local->nb_blocks - (block->index + 1))); - for (x = block->index; x < local->nb_blocks - 1; x++) { + for (int x = block->index; x < local->nb_blocks - 1; x++) { local->block[x].index--; } } @@ -725,7 +720,7 @@ static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) local->nb_blocks--; if (local->nb_blocks && rdma->blockmap) { - for (x = 0; x < local->nb_blocks; x++) { + for (int x = 0; x < local->nb_blocks; x++) { g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)local->block[x].offset, &local->block[x]); @@ -828,12 +823,12 @@ static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp) * Otherwise, there are no guarantees until the bug is fixed in linux. */ if (!verbs) { - int num_devices, x; + int num_devices; struct ibv_device **dev_list = ibv_get_device_list(&num_devices); bool roce_found = false; bool ib_found = false; - for (x = 0; x < num_devices; x++) { + for (int x = 0; x < num_devices; x++) { verbs = ibv_open_device(dev_list[x]); /* * ibv_open_device() is not documented to set errno. If @@ -925,7 +920,6 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) char port_str[16]; struct rdma_cm_event *cm_event; char ip[40] = "unknown"; - struct rdma_addrinfo *e; if (rdma->host == NULL || !strcmp(rdma->host, "")) { error_setg(errp, "RDMA ERROR: RDMA hostname has not been set"); @@ -957,7 +951,7 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) } /* Try all addresses, saving the first error in @err */ - for (e = res; e != NULL; e = e->ai_next) { + for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) { Error **local_errp = err ? 
NULL : &err; inet_ntop(e->ai_family, @@ -1113,7 +1107,6 @@ err_alloc_pd_cq: static int qemu_rdma_alloc_qp(RDMAContext *rdma) { struct ibv_qp_init_attr attr = { 0 }; - int ret; attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX; attr.cap.max_recv_wr = 3; @@ -1123,8 +1116,7 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma) attr.recv_cq = rdma->recv_cq; attr.qp_type = IBV_QPT_RC; - ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr); - if (ret < 0) { + if (rdma_create_qp(rdma->cm_id, rdma->pd, &attr) < 0) { return -1; } @@ -1136,8 +1128,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma) static bool rdma_support_odp(struct ibv_context *dev) { struct ibv_device_attr_ex attr = {0}; - int ret = ibv_query_device_ex(dev, NULL, &attr); - if (ret) { + + if (ibv_query_device_ex(dev, NULL, &attr)) { return false; } @@ -1514,7 +1506,6 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma, struct ibv_comp_channel *comp_channel) { struct rdma_cm_event *cm_event; - int ret; /* * Coroutine doesn't start until migration_fd_process_incoming() @@ -1550,8 +1541,7 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma, } if (pfds[1].revents) { - ret = rdma_get_cm_event(rdma->channel, &cm_event); - if (ret < 0) { + if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) { return -1; } @@ -2323,12 +2313,10 @@ static int qemu_rdma_write(RDMAContext *rdma, uint64_t current_addr = block_offset + offset; uint64_t index = rdma->current_index; uint64_t chunk = rdma->current_chunk; - int ret; /* If we cannot merge it, we flush the current buffer first. */ if (!qemu_rdma_buffer_mergeable(rdma, current_addr, len)) { - ret = qemu_rdma_write_flush(rdma, errp); - if (ret < 0) { + if (qemu_rdma_write_flush(rdma, errp) < 0) { return -1; } rdma->current_length = 0; @@ -2354,7 +2342,6 @@ static int qemu_rdma_write(RDMAContext *rdma, static void qemu_rdma_cleanup(RDMAContext *rdma) { Error *err = NULL; - int idx; if (rdma->cm_id && rdma->connected) { if ((rdma->errored || @@ -2381,12 +2368,12 @@ static void qemu_rdma_cleanup(RDMAContext *rdma) g_free(rdma->dest_blocks); rdma->dest_blocks = NULL; - for (idx = 0; idx < RDMA_WRID_MAX; idx++) { - if (rdma->wr_data[idx].control_mr) { + for (int i = 0; i < RDMA_WRID_MAX; i++) { + if (rdma->wr_data[i].control_mr) { rdma->total_registrations--; - ibv_dereg_mr(rdma->wr_data[idx].control_mr); + ibv_dereg_mr(rdma->wr_data[i].control_mr); } - rdma->wr_data[idx].control_mr = NULL; + rdma->wr_data[i].control_mr = NULL; } if (rdma->local_ram_blocks.block) { @@ -2452,7 +2439,7 @@ static void qemu_rdma_cleanup(RDMAContext *rdma) static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp) { - int ret, idx; + int ret; /* * Will be validated against destination's actual capabilities @@ -2480,18 +2467,17 @@ static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp) /* Build the hash that maps from offset to RAMBlock */ rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal); - for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) { + for (int i = 0; i < rdma->local_ram_blocks.nb_blocks; i++) { g_hash_table_insert(rdma->blockmap, - (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset, - &rdma->local_ram_blocks.block[idx]); + (void *)(uintptr_t)rdma->local_ram_blocks.block[i].offset, + &rdma->local_ram_blocks.block[i]); } - for (idx = 0; idx < RDMA_WRID_MAX; idx++) { - ret = qemu_rdma_reg_control(rdma, idx); + for (int i = 0; i < RDMA_WRID_MAX; i++) { + ret = qemu_rdma_reg_control(rdma, i); if (ret < 0) { - error_setg(errp, - "RDMA 
ERROR: rdma migration: error registering %d control!", - idx); + error_setg(errp, "RDMA ERROR: rdma migration: error " + "registering %d control!", i); goto err_rdma_source_init; } } @@ -2625,16 +2611,16 @@ err_rdma_source_connect: static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) { Error *err = NULL; - int ret, idx; + int ret; struct rdma_cm_id *listen_id; char ip[40] = "unknown"; struct rdma_addrinfo *res, *e; char port_str[16]; int reuse = 1; - for (idx = 0; idx < RDMA_WRID_MAX; idx++) { - rdma->wr_data[idx].control_len = 0; - rdma->wr_data[idx].control_curr = NULL; + for (int i = 0; i < RDMA_WRID_MAX; i++) { + rdma->wr_data[i].control_len = 0; + rdma->wr_data[i].control_curr = NULL; } if (!rdma->host || !rdma->host[0]) { @@ -2723,11 +2709,9 @@ err_dest_init_create_listen_id: static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path, RDMAContext *rdma) { - int idx; - - for (idx = 0; idx < RDMA_WRID_MAX; idx++) { - rdma_return_path->wr_data[idx].control_len = 0; - rdma_return_path->wr_data[idx].control_curr = NULL; + for (int i = 0; i < RDMA_WRID_MAX; i++) { + rdma_return_path->wr_data[i].control_len = 0; + rdma_return_path->wr_data[i].control_curr = NULL; } /*the CM channel and CM id is shared*/ @@ -2781,7 +2765,7 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc, RDMAContext *rdma; int ret; ssize_t done = 0; - size_t i, len; + size_t len; RCU_READ_LOCK_GUARD(); rdma = qatomic_rcu_read(&rioc->rdmaout); @@ -2807,7 +2791,7 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc, return -1; } - for (i = 0; i < niov; i++) { + for (int i = 0; i < niov; i++) { size_t remaining = iov[i].iov_len; uint8_t * data = (void *)iov[i].iov_base; while (remaining) { @@ -2870,7 +2854,7 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc, RDMAControlHeader head; int ret; ssize_t done = 0; - size_t i, len; + size_t len; RCU_READ_LOCK_GUARD(); rdma = qatomic_rcu_read(&rioc->rdmain); @@ -2886,7 +2870,7 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc, return -1; } - for (i = 0; i < niov; i++) { + for (int i = 0; i < niov; i++) { size_t want = iov[i].iov_len; uint8_t *data = (void *)iov[i].iov_base; @@ -2946,7 +2930,6 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc, static int qemu_rdma_drain_cq(RDMAContext *rdma) { Error *err = NULL; - int ret; if (qemu_rdma_write_flush(rdma, &err) < 0) { error_report_err(err); @@ -2954,8 +2937,7 @@ static int qemu_rdma_drain_cq(RDMAContext *rdma) } while (rdma->nb_sent) { - ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL); - if (ret < 0) { + if (qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL) < 0) { error_report("rdma migration: complete polling error!"); return -1; } @@ -3240,10 +3222,6 @@ static int qemu_rdma_save_page(QEMUFile *f, ram_addr_t block_offset, RDMAContext *rdma; int ret; - if (migration_in_postcopy()) { - return RAM_SAVE_CONTROL_NOT_SUPP; - } - RCU_READ_LOCK_GUARD(); rdma = qatomic_rcu_read(&rioc->rdmaout); @@ -3314,17 +3292,33 @@ err: return -1; } +int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset, + ram_addr_t offset, size_t size) +{ + if (!migrate_rdma() || migration_in_postcopy()) { + return RAM_SAVE_CONTROL_NOT_SUPP; + } + + int ret = qemu_rdma_save_page(f, block_offset, offset, size); + + if (ret != RAM_SAVE_CONTROL_DELAYED && + ret != RAM_SAVE_CONTROL_NOT_SUPP) { + if (ret < 0) { + qemu_file_set_error(f, ret); + } + } + return ret; +} + static void rdma_accept_incoming_migration(void *opaque); static void rdma_cm_poll_handler(void *opaque) { 
RDMAContext *rdma = opaque; - int ret; struct rdma_cm_event *cm_event; MigrationIncomingState *mis = migration_incoming_get_current(); - ret = rdma_get_cm_event(rdma->channel, &cm_event); - if (ret < 0) { + if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) { error_report("get_cm_event failed %d", errno); return; } @@ -3362,7 +3356,6 @@ static int qemu_rdma_accept(RDMAContext *rdma) struct rdma_cm_event *cm_event; struct ibv_context *verbs; int ret; - int idx; ret = rdma_get_cm_event(rdma->channel, &cm_event); if (ret < 0) { @@ -3448,10 +3441,10 @@ static int qemu_rdma_accept(RDMAContext *rdma) qemu_rdma_init_ram_blocks(rdma); - for (idx = 0; idx < RDMA_WRID_MAX; idx++) { - ret = qemu_rdma_reg_control(rdma, idx); + for (int i = 0; i < RDMA_WRID_MAX; i++) { + ret = qemu_rdma_reg_control(rdma, i); if (ret < 0) { - error_report("rdma: error registering %d control", idx); + error_report("rdma: error registering %d control", i); goto err_rdma_dest_wait; } } @@ -3522,7 +3515,7 @@ static int dest_ram_sort_func(const void *a, const void *b) * * Keep doing this until the source tells us to stop. */ -static int qemu_rdma_registration_handle(QEMUFile *f) +int rdma_registration_handle(QEMUFile *f) { RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult), .type = RDMA_CONTROL_REGISTER_RESULT, @@ -3534,7 +3527,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f) }; RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT, .repeat = 1 }; - QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); + QIOChannelRDMA *rioc; Error *err = NULL; RDMAContext *rdma; RDMALocalBlocks *local; @@ -3547,10 +3540,13 @@ static int qemu_rdma_registration_handle(QEMUFile *f) void *host_addr; int ret; int idx = 0; - int count = 0; - int i = 0; + + if (!migrate_rdma()) { + return 0; + } RCU_READ_LOCK_GUARD(); + rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); rdma = qatomic_rcu_read(&rioc->rdmain); if (!rdma) { @@ -3563,7 +3559,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f) local = &rdma->local_ram_blocks; do { - trace_qemu_rdma_registration_handle_wait(); + trace_rdma_registration_handle_wait(); ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE, &err); @@ -3583,9 +3579,9 @@ static int qemu_rdma_registration_handle(QEMUFile *f) comp = (RDMACompress *) rdma->wr_data[idx].control_curr; network_to_compress(comp); - trace_qemu_rdma_registration_handle_compress(comp->length, - comp->block_idx, - comp->offset); + trace_rdma_registration_handle_compress(comp->length, + comp->block_idx, + comp->offset); if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) { error_report("rdma: 'compress' bad block index %u (vs %d)", (unsigned int)comp->block_idx, @@ -3601,11 +3597,11 @@ static int qemu_rdma_registration_handle(QEMUFile *f) break; case RDMA_CONTROL_REGISTER_FINISHED: - trace_qemu_rdma_registration_handle_finished(); + trace_rdma_registration_handle_finished(); return 0; case RDMA_CONTROL_RAM_BLOCKS_REQUEST: - trace_qemu_rdma_registration_handle_ram_blocks(); + trace_rdma_registration_handle_ram_blocks(); /* Sort our local RAM Block list so it's the same as the source, * we can do this since we've filled in a src_index in the list @@ -3614,7 +3610,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f) qsort(rdma->local_ram_blocks.block, rdma->local_ram_blocks.nb_blocks, sizeof(RDMALocalBlock), dest_ram_sort_func); - for (i = 0; i < local->nb_blocks; i++) { + for (int i = 0; i < local->nb_blocks; i++) { local->block[i].index = i; } @@ -3632,7 +3628,7 @@ static int 
qemu_rdma_registration_handle(QEMUFile *f) * Both sides use the "remote" structure to communicate and update * their "local" descriptions with what was sent. */ - for (i = 0; i < local->nb_blocks; i++) { + for (int i = 0; i < local->nb_blocks; i++) { rdma->dest_blocks[i].remote_host_addr = (uintptr_t)(local->block[i].local_host_addr); @@ -3644,7 +3640,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f) rdma->dest_blocks[i].length = local->block[i].length; dest_block_to_network(&rdma->dest_blocks[i]); - trace_qemu_rdma_registration_handle_ram_blocks_loop( + trace_rdma_registration_handle_ram_blocks_loop( local->block[i].block_name, local->block[i].offset, local->block[i].length, @@ -3667,12 +3663,12 @@ static int qemu_rdma_registration_handle(QEMUFile *f) break; case RDMA_CONTROL_REGISTER_REQUEST: - trace_qemu_rdma_registration_handle_register(head.repeat); + trace_rdma_registration_handle_register(head.repeat); reg_resp.repeat = head.repeat; registers = (RDMARegister *) rdma->wr_data[idx].control_curr; - for (count = 0; count < head.repeat; count++) { + for (int count = 0; count < head.repeat; count++) { uint64_t chunk; uint8_t *chunk_start, *chunk_end; @@ -3681,7 +3677,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f) reg_result = &results[count]; - trace_qemu_rdma_registration_handle_register_loop(count, + trace_rdma_registration_handle_register_loop(count, reg->current_index, reg->key.current_addr, reg->chunks); if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) { @@ -3729,8 +3725,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f) reg_result->host_addr = (uintptr_t)block->local_host_addr; - trace_qemu_rdma_registration_handle_register_rkey( - reg_result->rkey); + trace_rdma_registration_handle_register_rkey(reg_result->rkey); result_to_network(reg_result); } @@ -3744,15 +3739,15 @@ static int qemu_rdma_registration_handle(QEMUFile *f) } break; case RDMA_CONTROL_UNREGISTER_REQUEST: - trace_qemu_rdma_registration_handle_unregister(head.repeat); + trace_rdma_registration_handle_unregister(head.repeat); unreg_resp.repeat = head.repeat; registers = (RDMARegister *) rdma->wr_data[idx].control_curr; - for (count = 0; count < head.repeat; count++) { + for (int count = 0; count < head.repeat; count++) { reg = ®isters[count]; network_to_register(reg); - trace_qemu_rdma_registration_handle_unregister_loop(count, + trace_rdma_registration_handle_unregister_loop(count, reg->current_index, reg->key.chunk); block = &(rdma->local_ram_blocks.block[reg->current_index]); @@ -3768,8 +3763,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f) rdma->total_registrations--; - trace_qemu_rdma_registration_handle_unregister_success( - reg->key.chunk); + trace_rdma_registration_handle_unregister_success(reg->key.chunk); } ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp, &err); @@ -3794,22 +3788,23 @@ err: } /* Destination: - * Called via a ram_control_load_hook during the initial RAM load section which - * lists the RAMBlocks by name. This lets us know the order of the RAMBlocks - * on the source. - * We've already built our local RAMBlock list, but not yet sent the list to - * the source. + * Called during the initial RAM load section which lists the + * RAMBlocks by name. This lets us know the order of the RAMBlocks on + * the source. We've already built our local RAMBlock list, but not + * yet sent the list to the source. 
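/*
 * Editor's sketch: the rdma_* entry points made public in this series
 * (rdma_registration_handle(), rdma_block_notification_handle(), and
 * the start/stop pair) all begin with a migrate_rdma() test, so the
 * generic RAM code can call them unconditionally and non-RDMA streams
 * fall through as no-ops. The guard pattern in isolation:
 */
#include <stdbool.h>

static bool rdma_active;          /* stands in for migrate_rdma() */

static int registration_handle(void)
{
    if (!rdma_active) {
        return 0;                 /* nothing to do on non-RDMA streams */
    }
    /* ... RDMA control-channel protocol handling ... */
    return 0;
}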
*/ -static int -rdma_block_notification_handle(QEMUFile *f, const char *name) +int rdma_block_notification_handle(QEMUFile *f, const char *name) { - RDMAContext *rdma; - QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); int curr; int found = -1; + if (!migrate_rdma()) { + return 0; + } + RCU_READ_LOCK_GUARD(); - rdma = qatomic_rcu_read(&rioc->rdmain); + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); + RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmain); if (!rdma) { return -1; @@ -3835,33 +3830,15 @@ rdma_block_notification_handle(QEMUFile *f, const char *name) return 0; } -static int rdma_load_hook(QEMUFile *f, uint64_t flags, void *data) -{ - switch (flags) { - case RAM_CONTROL_BLOCK_REG: - return rdma_block_notification_handle(f, data); - - case RAM_CONTROL_HOOK: - return qemu_rdma_registration_handle(f); - - default: - /* Shouldn't be called with any other values */ - abort(); - } -} - -static int qemu_rdma_registration_start(QEMUFile *f, - uint64_t flags, void *data) +int rdma_registration_start(QEMUFile *f, uint64_t flags) { - QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); - RDMAContext *rdma; - - if (migration_in_postcopy()) { + if (!migrate_rdma() || migration_in_postcopy()) { return 0; } + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); RCU_READ_LOCK_GUARD(); - rdma = qatomic_rcu_read(&rioc->rdmaout); + RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmaout); if (!rdma) { return -1; } @@ -3870,7 +3847,7 @@ static int qemu_rdma_registration_start(QEMUFile *f, return -1; } - trace_qemu_rdma_registration_start(flags); + trace_rdma_registration_start(flags); qemu_put_be64(f, RAM_SAVE_FLAG_HOOK); qemu_fflush(f); @@ -3881,20 +3858,20 @@ static int qemu_rdma_registration_start(QEMUFile *f, * Inform dest that dynamic registrations are done for now. * First, flush writes, if any. */ -static int qemu_rdma_registration_stop(QEMUFile *f, - uint64_t flags, void *data) +int rdma_registration_stop(QEMUFile *f, uint64_t flags) { - QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); + QIOChannelRDMA *rioc; Error *err = NULL; RDMAContext *rdma; RDMAControlHeader head = { .len = 0, .repeat = 1 }; int ret; - if (migration_in_postcopy()) { + if (!migrate_rdma() || migration_in_postcopy()) { return 0; } RCU_READ_LOCK_GUARD(); + rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); rdma = qatomic_rcu_read(&rioc->rdmaout); if (!rdma) { return -1; @@ -3914,10 +3891,10 @@ static int qemu_rdma_registration_stop(QEMUFile *f, if (flags == RAM_CONTROL_SETUP) { RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT }; RDMALocalBlocks *local = &rdma->local_ram_blocks; - int reg_result_idx, i, nb_dest_blocks; + int reg_result_idx, nb_dest_blocks; head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST; - trace_qemu_rdma_registration_stop_ram(); + trace_rdma_registration_stop_ram(); /* * Make sure that we parallelize the pinning on both sides. 
@@ -3962,7 +3939,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, qemu_rdma_move_header(rdma, reg_result_idx, &resp); memcpy(rdma->dest_blocks, rdma->wr_data[reg_result_idx].control_curr, resp.len); - for (i = 0; i < nb_dest_blocks; i++) { + for (int i = 0; i < nb_dest_blocks; i++) { network_to_dest_block(&rdma->dest_blocks[i]); /* We require that the blocks are in the same order */ @@ -3981,7 +3958,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, } } - trace_qemu_rdma_registration_stop(flags); + trace_rdma_registration_stop(flags); head.type = RDMA_CONTROL_REGISTER_FINISHED; ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL, &err); @@ -3997,17 +3974,6 @@ err: return -1; } -static const QEMUFileHooks rdma_read_hooks = { - .hook_ram_load = rdma_load_hook, -}; - -static const QEMUFileHooks rdma_write_hooks = { - .before_ram_iterate = qemu_rdma_registration_start, - .after_ram_iterate = qemu_rdma_registration_stop, - .save_page = qemu_rdma_save_page, -}; - - static void qio_channel_rdma_finalize(Object *obj) { QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj); @@ -4059,7 +4025,6 @@ static QEMUFile *rdma_new_input(RDMAContext *rdma) rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc)); rioc->rdmain = rdma; rioc->rdmaout = rdma->return_path; - qemu_file_set_hooks(rioc->file, &rdma_read_hooks); return rioc->file; } @@ -4071,7 +4036,6 @@ static QEMUFile *rdma_new_output(RDMAContext *rdma) rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc)); rioc->rdmaout = rdma; rioc->rdmain = rdma->return_path; - qemu_file_set_hooks(rioc->file, &rdma_write_hooks); return rioc->file; } @@ -4079,14 +4043,11 @@ static QEMUFile *rdma_new_output(RDMAContext *rdma) static void rdma_accept_incoming_migration(void *opaque) { RDMAContext *rdma = opaque; - int ret; QEMUFile *f; Error *local_err = NULL; trace_qemu_rdma_accept_incoming_migration(); - ret = qemu_rdma_accept(rdma); - - if (ret < 0) { + if (qemu_rdma_accept(rdma) < 0) { error_report("RDMA ERROR: Migration initialization failed"); return; } @@ -4113,6 +4074,7 @@ static void rdma_accept_incoming_migration(void *opaque) void rdma_start_incoming_migration(const char *host_port, Error **errp) { + MigrationState *s = migrate_get_current(); int ret; RDMAContext *rdma; @@ -4144,7 +4106,7 @@ void rdma_start_incoming_migration(const char *host_port, Error **errp) } trace_rdma_start_incoming_migration_after_rdma_listen(); - + s->rdma_migration = true; qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration, NULL, (void *)(intptr_t)rdma); return; @@ -4220,6 +4182,7 @@ void rdma_start_outgoing_migration(void *opaque, trace_rdma_start_outgoing_migration_after_rdma_connect(); s->to_dst_file = rdma_new_output(rdma); + s->rdma_migration = true; migrate_fd_connect(s, NULL); return; return_path_err: diff --git a/migration/rdma.h b/migration/rdma.h index de2ba09..30b15b4 100644 --- a/migration/rdma.h +++ b/migration/rdma.h @@ -17,9 +17,51 @@ #ifndef QEMU_MIGRATION_RDMA_H #define QEMU_MIGRATION_RDMA_H +#include "exec/memory.h" + void rdma_start_outgoing_migration(void *opaque, const char *host_port, Error **errp); void rdma_start_incoming_migration(const char *host_port, Error **errp); +/* + * Constants used by rdma return codes + */ +#define RAM_CONTROL_SETUP 0 +#define RAM_CONTROL_ROUND 1 +#define RAM_CONTROL_FINISH 3 + +/* + * Whenever this is found in the data stream, the flags + * will be passed to rdma functions in the incoming-migration + * side. 
+ */ +#define RAM_SAVE_FLAG_HOOK 0x80 + +#define RAM_SAVE_CONTROL_NOT_SUPP -1000 +#define RAM_SAVE_CONTROL_DELAYED -2000 + +#ifdef CONFIG_RDMA +int rdma_registration_handle(QEMUFile *f); +int rdma_registration_start(QEMUFile *f, uint64_t flags); +int rdma_registration_stop(QEMUFile *f, uint64_t flags); +int rdma_block_notification_handle(QEMUFile *f, const char *name); +int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset, + ram_addr_t offset, size_t size); +#else +static inline +int rdma_registration_handle(QEMUFile *f) { return 0; } +static inline +int rdma_registration_start(QEMUFile *f, uint64_t flags) { return 0; } +static inline +int rdma_registration_stop(QEMUFile *f, uint64_t flags) { return 0; } +static inline +int rdma_block_notification_handle(QEMUFile *f, const char *name) { return 0; } +static inline +int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset, + ram_addr_t offset, size_t size) +{ + return RAM_SAVE_CONTROL_NOT_SUPP; +} +#endif #endif diff --git a/migration/savevm.c b/migration/savevm.c index 497ce02..8622f22 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1217,13 +1217,27 @@ void qemu_savevm_non_migratable_list(strList **reasons) void qemu_savevm_state_header(QEMUFile *f) { + MigrationState *s = migrate_get_current(); + + s->vmdesc = json_writer_new(false); + trace_savevm_state_header(); qemu_put_be32(f, QEMU_VM_FILE_MAGIC); qemu_put_be32(f, QEMU_VM_FILE_VERSION); - if (migrate_get_current()->send_configuration) { + if (s->send_configuration) { qemu_put_byte(f, QEMU_VM_CONFIGURATION); - vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0); + + /* + * This starts the main json object and is paired with the + * json_writer_end_object in + * qemu_savevm_state_complete_precopy_non_iterable + */ + json_writer_start_object(s->vmdesc, NULL); + + json_writer_start_object(s->vmdesc, "configuration"); + vmstate_save_state(f, &vmstate_configuration, &savevm_state, s->vmdesc); + json_writer_end_object(s->vmdesc); } } @@ -1272,8 +1286,6 @@ void qemu_savevm_state_setup(QEMUFile *f) Error *local_err = NULL; int ret; - ms->vmdesc = json_writer_new(false); - json_writer_start_object(ms->vmdesc, NULL); json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size()); json_writer_start_array(ms->vmdesc, "devices"); @@ -1660,10 +1672,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) } ms->to_dst_file = f; - qemu_mutex_unlock_iothread(); qemu_savevm_state_header(f); qemu_savevm_state_setup(f); - qemu_mutex_lock_iothread(); while (qemu_file_get_error(f) == 0) { if (qemu_savevm_state_iterate(f, false) > 0) { diff --git a/migration/trace-events b/migration/trace-events index ee9c8f4..fa9486d 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -125,6 +125,7 @@ postcopy_preempt_reset_channel(void) "" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %u" +multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p" multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" @@ -144,7 +145,7 @@ multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" 
multifd_tls_outgoing_handshake_complete(void *ioc) "ioc=%p" -multifd_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname, void *err) "ioc=%p ioctype=%s hostname=%s err=%p" +multifd_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname) "ioc=%p ioctype=%s hostname=%s" # migration.c await_return_path_close_on_source_close(void) "" @@ -186,7 +187,7 @@ source_return_path_thread_shut(uint32_t val) "0x%x" source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32 source_return_path_thread_switchover_acked(void) "" migration_thread_low_pending(uint64_t pending) "%" PRIu64 -migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " max_size %" PRId64 +migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 " max_size %" PRId64 process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d" process_incoming_migration_co_postcopy_end_main(void) "" postcopy_preempt_enabled(bool value) "%d" @@ -231,20 +232,6 @@ qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.." qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p" qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s" qemu_rdma_advise_mr(const char *name, uint32_t len, uint64_t addr, const char *res) "Try to advise block %s prefetch at %" PRIu32 "@0x%" PRIx64 ": %s" -qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64 -qemu_rdma_registration_handle_finished(void) "" -qemu_rdma_registration_handle_ram_blocks(void) "" -qemu_rdma_registration_handle_ram_blocks_loop(const char *name, uint64_t offset, uint64_t length, void *local_host_addr, unsigned int src_index) "%s: @0x%" PRIx64 "/%" PRIu64 " host:@%p src_index: %u" -qemu_rdma_registration_handle_register(int requests) "%d requests" -qemu_rdma_registration_handle_register_loop(int req, int index, uint64_t addr, uint64_t chunks) "Registration request (%d): index %d, current_addr %" PRIu64 " chunks: %" PRIu64 -qemu_rdma_registration_handle_register_rkey(int rkey) "0x%x" -qemu_rdma_registration_handle_unregister(int requests) "%d requests" -qemu_rdma_registration_handle_unregister_loop(int count, int index, uint64_t chunk) "Unregistration request (%d): index %d, chunk %" PRIu64 -qemu_rdma_registration_handle_unregister_success(uint64_t chunk) "%" PRIu64 -qemu_rdma_registration_handle_wait(void) "" -qemu_rdma_registration_start(uint64_t flags) "%" PRIu64 -qemu_rdma_registration_stop(uint64_t flags) "%" PRIu64 -qemu_rdma_registration_stop_ram(void) "" qemu_rdma_resolve_host_trying(const char *host, const char *ip) "Trying %s => %s" qemu_rdma_signal_unregister_append(uint64_t chunk, int pos) "Appending unregister chunk %" PRIu64 " at position %d" qemu_rdma_signal_unregister_already(uint64_t chunk) "Unregister chunk %" PRIu64 " already in queue" @@ -263,6 +250,20 @@ qemu_rdma_write_one_zero(uint64_t chunk, int len, int index, int64_t offset) "En rdma_add_block(const char *block_name, int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: '%s':%d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks 
%d" rdma_block_notification_handle(const char *name, int index) "%s at %d" rdma_delete_block(void *block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %p, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" +rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64 +rdma_registration_handle_finished(void) "" +rdma_registration_handle_ram_blocks(void) "" +rdma_registration_handle_ram_blocks_loop(const char *name, uint64_t offset, uint64_t length, void *local_host_addr, unsigned int src_index) "%s: @0x%" PRIx64 "/%" PRIu64 " host:@%p src_index: %u" +rdma_registration_handle_register(int requests) "%d requests" +rdma_registration_handle_register_loop(int req, int index, uint64_t addr, uint64_t chunks) "Registration request (%d): index %d, current_addr %" PRIu64 " chunks: %" PRIu64 +rdma_registration_handle_register_rkey(int rkey) "0x%x" +rdma_registration_handle_unregister(int requests) "%d requests" +rdma_registration_handle_unregister_loop(int count, int index, uint64_t chunk) "Unregistration request (%d): index %d, chunk %" PRIu64 +rdma_registration_handle_unregister_success(uint64_t chunk) "%" PRIu64 +rdma_registration_handle_wait(void) "" +rdma_registration_start(uint64_t flags) "%" PRIu64 +rdma_registration_stop(uint64_t flags) "%" PRIu64 +rdma_registration_stop_ram(void) "" rdma_start_incoming_migration(void) "" rdma_start_incoming_migration_after_dest_init(void) "" rdma_start_incoming_migration_after_rdma_listen(void) "" diff --git a/qapi/migration.json b/qapi/migration.json index d7dfaa5..db3df12 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -73,7 +73,7 @@ { 'struct': 'MigrationStats', 'data': {'transferred': 'int', 'remaining': 'int', 'total': 'int' , 'duplicate': 'int', - 'skipped': { 'type': 'int', 'features': ['deprecated'] }, + 'skipped': { 'type': 'int', 'features': [ 'deprecated' ] }, 'normal': 'int', 'normal-bytes': 'int', 'dirty-pages-rate': 'int', 'mbps': 'number', 'dirty-sync-count': 'int', @@ -440,10 +440,9 @@ # compress and xbzrle are both on, compress only takes effect in # the ram bulk stage, after that, it will be disabled and only # xbzrle takes effect, this can help to minimize migration -# traffic. The feature is disabled by default. (since 2.4 ) +# traffic. The feature is disabled by default. (since 2.4) # -# @events: generate events for each migration state change (since 2.4 -# ) +# @events: generate events for each migration state change (since 2.4) # # @auto-converge: If enabled, QEMU will automatically throttle down # the guest to speed up convergence of RAM migration. (since 1.6) @@ -758,6 +757,16 @@ # @max-bandwidth: to set maximum speed for migration. maximum speed # in bytes per second. (Since 2.8) # +# @avail-switchover-bandwidth: to set the available bandwidth that +# migration can use during switchover phase. NOTE! This does not +# limit the bandwidth during switchover, but only for calculations when +# making decisions to switchover. By default, this value is zero, +# which means QEMU will estimate the bandwidth automatically. This can +# be set when the estimated value is not accurate, while the user is +# able to guarantee such bandwidth is available when switching over. +# When specified correctly, this can make the switchover decision much +# more accurate. (Since 8.2) +# # @downtime-limit: set maximum tolerated downtime for migration. 
# maximum downtime in milliseconds (Since 2.8) # @@ -839,7 +848,7 @@ 'cpu-throttle-initial', 'cpu-throttle-increment', 'cpu-throttle-tailslow', 'tls-creds', 'tls-hostname', 'tls-authz', 'max-bandwidth', - 'downtime-limit', + 'avail-switchover-bandwidth', 'downtime-limit', { 'name': 'x-checkpoint-delay', 'features': [ 'unstable' ] }, 'block-incremental', 'multifd-channels', @@ -924,6 +933,16 @@ # @max-bandwidth: to set maximum speed for migration. maximum speed # in bytes per second. (Since 2.8) # +# @avail-switchover-bandwidth: to set the available bandwidth that +# migration can use during switchover phase. NOTE! This does not +# limit the bandwidth during switchover, but only for calculations when +# making decisions to switchover. By default, this value is zero, +# which means QEMU will estimate the bandwidth automatically. This can +# be set when the estimated value is not accurate, while the user is +# able to guarantee such bandwidth is available when switching over. +# When specified correctly, this can make the switchover decision much +# more accurate. (Since 8.2) +# # @downtime-limit: set maximum tolerated downtime for migration. # maximum downtime in milliseconds (Since 2.8) # @@ -1017,6 +1036,7 @@ '*tls-hostname': 'StrOrNull', '*tls-authz': 'StrOrNull', '*max-bandwidth': 'size', + '*avail-switchover-bandwidth': 'size', '*downtime-limit': 'uint64', '*x-checkpoint-delay': { 'type': 'uint32', 'features': [ 'unstable' ] }, @@ -1127,6 +1147,16 @@ # @max-bandwidth: to set maximum speed for migration. maximum speed # in bytes per second. (Since 2.8) # +# @avail-switchover-bandwidth: to set the available bandwidth that +# migration can use during switchover phase. NOTE! This does not +# limit the bandwidth during switchover, but only for calculations when +# making decisions to switchover. By default, this value is zero, +# which means QEMU will estimate the bandwidth automatically. This can +# be set when the estimated value is not accurate, while the user is +# able to guarantee such bandwidth is available when switching over. +# When specified correctly, this can make the switchover decision much +# more accurate. (Since 8.2) +# # @downtime-limit: set maximum tolerated downtime for migration. 
# maximum downtime in milliseconds (Since 2.8) # @@ -1217,6 +1247,7 @@ '*tls-hostname': 'str', '*tls-authz': 'str', '*max-bandwidth': 'size', + '*avail-switchover-bandwidth': 'size', '*downtime-limit': 'uint64', '*x-checkpoint-delay': { 'type': 'uint32', 'features': [ 'unstable' ] }, diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py index 0824245..de506cb 100755 --- a/scripts/analyze-migration.py +++ b/scripts/analyze-migration.py @@ -38,13 +38,13 @@ class MigrationFile(object): self.file = open(self.filename, "rb") def read64(self): - return int.from_bytes(self.file.read(8), byteorder='big', signed=True) + return int.from_bytes(self.file.read(8), byteorder='big', signed=False) def read32(self): - return int.from_bytes(self.file.read(4), byteorder='big', signed=True) + return int.from_bytes(self.file.read(4), byteorder='big', signed=False) def read16(self): - return int.from_bytes(self.file.read(2), byteorder='big', signed=True) + return int.from_bytes(self.file.read(2), byteorder='big', signed=False) def read8(self): return int.from_bytes(self.file.read(1), byteorder='big', signed=True) @@ -123,6 +123,7 @@ class RamSection(object): self.TARGET_PAGE_SIZE = ramargs['page_size'] self.dump_memory = ramargs['dump_memory'] self.write_memory = ramargs['write_memory'] + self.ignore_shared = ramargs['ignore_shared'] self.sizeinfo = collections.OrderedDict() self.data = collections.OrderedDict() self.data['section sizes'] = self.sizeinfo @@ -169,6 +170,8 @@ class RamSection(object): f.truncate(0) f.truncate(len) self.files[self.name] = f + if self.ignore_shared: + mr_addr = self.file.read64() flags &= ~self.RAM_SAVE_FLAG_MEM_SIZE if flags & self.RAM_SAVE_FLAG_COMPRESS: @@ -261,12 +264,41 @@ class HTABSection(object): class ConfigurationSection(object): - def __init__(self, file): + def __init__(self, file, desc): self.file = file + self.desc = desc + self.caps = [] + + def parse_capabilities(self, vmsd_caps): + if not vmsd_caps: + return + + ncaps = vmsd_caps.data['caps_count'].data + self.caps = vmsd_caps.data['capabilities'] + + if type(self.caps) != list: + self.caps = [self.caps] + + if len(self.caps) != ncaps: + raise Exception("Number of capabilities doesn't match " + "caps_count field") + + def has_capability(self, cap): + return any([str(c) == cap for c in self.caps]) def read(self): - name_len = self.file.read32() - name = self.file.readstr(len = name_len) + if self.desc: + version_id = self.desc['version'] + section = VMSDSection(self.file, version_id, self.desc, + 'configuration') + section.read() + self.parse_capabilities( + section.data.get("configuration/capabilities")) + else: + # backward compatibility for older streams that don't have + # the configuration section in the json + name_len = self.file.read32() + name = self.file.readstr(len = name_len) class VMSDFieldGeneric(object): def __init__(self, desc, file): @@ -288,6 +320,23 @@ class VMSDFieldGeneric(object): self.data = self.file.readvar(size) return self.data +class VMSDFieldCap(object): + def __init__(self, desc, file): + self.file = file + self.desc = desc + self.data = "" + + def __repr__(self): + return self.data + + def __str__(self): + return self.data + + def read(self): + len = self.file.read8() + self.data = self.file.readstr(len) + + class VMSDFieldInt(VMSDFieldGeneric): def __init__(self, desc, file): super(VMSDFieldInt, self).__init__(desc, file) @@ -462,6 +511,7 @@ vmsd_field_readers = { "unused_buffer" : VMSDFieldGeneric, "bitmap" : VMSDFieldGeneric, "struct" : VMSDFieldStruct, + 
"capability": VMSDFieldCap, "unknown" : VMSDFieldGeneric, } @@ -525,6 +575,7 @@ class MigrationDump(object): ramargs['page_size'] = self.vmsd_desc['page_size'] ramargs['dump_memory'] = dump_memory ramargs['write_memory'] = write_memory + ramargs['ignore_shared'] = False self.section_classes[('ram',0)][1] = ramargs while True: @@ -532,8 +583,10 @@ class MigrationDump(object): if section_type == self.QEMU_VM_EOF: break elif section_type == self.QEMU_VM_CONFIGURATION: - section = ConfigurationSection(file) + config_desc = self.vmsd_desc.get('configuration') + section = ConfigurationSection(file, config_desc) section.read() + ramargs['ignore_shared'] = section.has_capability('x-ignore-shared') elif section_type == self.QEMU_VM_SECTION_START or section_type == self.QEMU_VM_SECTION_FULL: section_id = file.read32() name = file.readstr() diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index bda8899..7ca4b77 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -168,6 +168,7 @@ meson_options_help() { printf "%s\n" ' rbd Ceph block device driver' printf "%s\n" ' rdma Enable RDMA-based migration' printf "%s\n" ' replication replication support' + printf "%s\n" ' rutabaga-gfx rutabaga_gfx support' printf "%s\n" ' sdl SDL user interface' printf "%s\n" ' sdl-image SDL Image support for icons' printf "%s\n" ' seccomp seccomp support' @@ -446,6 +447,8 @@ _meson_option_parse() { --disable-replication) printf "%s" -Dreplication=disabled ;; --enable-rng-none) printf "%s" -Drng_none=true ;; --disable-rng-none) printf "%s" -Drng_none=false ;; + --enable-rutabaga-gfx) printf "%s" -Drutabaga_gfx=enabled ;; + --disable-rutabaga-gfx) printf "%s" -Drutabaga_gfx=disabled ;; --enable-safe-stack) printf "%s" -Dsafe_stack=true ;; --disable-safe-stack) printf "%s" -Dsafe_stack=false ;; --enable-sanitizers) printf "%s" -Dsanitizers=true ;; diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index 74f4e41..1b8005a 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -86,6 +86,9 @@ static const QDevAlias qdev_alias_table[] = { { "virtio-gpu-pci", "virtio-gpu", QEMU_ARCH_VIRTIO_PCI }, { "virtio-gpu-gl-device", "virtio-gpu-gl", QEMU_ARCH_VIRTIO_MMIO }, { "virtio-gpu-gl-pci", "virtio-gpu-gl", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-gpu-rutabaga-device", "virtio-gpu-rutabaga", + QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-gpu-rutabaga-pci", "virtio-gpu-rutabaga", QEMU_ARCH_VIRTIO_PCI }, { "virtio-input-host-device", "virtio-input-host", QEMU_ARCH_VIRTIO_MMIO }, { "virtio-input-host-ccw", "virtio-input-host", QEMU_ARCH_VIRTIO_CCW }, { "virtio-input-host-pci", "virtio-input-host", QEMU_ARCH_VIRTIO_PCI }, diff --git a/system/vl.c b/system/vl.c index ba83040..3100ac0 100644 --- a/system/vl.c +++ b/system/vl.c @@ -216,6 +216,7 @@ static struct { { .driver = "ati-vga", .flag = &default_vga }, { .driver = "vhost-user-vga", .flag = &default_vga }, { .driver = "virtio-vga-gl", .flag = &default_vga }, + { .driver = "virtio-vga-rutabaga", .flag = &default_vga }, }; static QemuOptsList qemu_rtc_opts = { diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build index 66795cf..d6022eb 100644 --- a/tests/qtest/meson.build +++ b/tests/qtest/meson.build @@ -357,6 +357,8 @@ foreach dir : target_dirs test_deps += [qsd] endif + qtest_env.set('PYTHON', python.full_path()) + foreach test : target_qtests # Executables are shared across targets, declare them only the first time we # encounter them diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 
8eb2053..e1c1105 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -66,6 +66,12 @@ static bool got_dst_resume; */ #define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */ +#define ANALYZE_SCRIPT "scripts/analyze-migration.py" + +#define QEMU_VM_FILE_MAGIC 0x5145564d +#define FILE_TEST_FILENAME "migfile" +#define FILE_TEST_OFFSET 0x1000 + #if defined(__linux__) #include <sys/syscall.h> #include <sys/vfs.h> @@ -882,6 +888,7 @@ static void test_migrate_end(QTestState *from, QTestState *to, bool test_dest) cleanup("migsocket"); cleanup("src_serial"); cleanup("dest_serial"); + cleanup(FILE_TEST_FILENAME); } #ifdef CONFIG_GNUTLS @@ -1501,6 +1508,61 @@ static void test_baddest(void) test_migrate_end(from, to, false); } +#ifndef _WIN32 +static void test_analyze_script(void) +{ + MigrateStart args = { + .opts_source = "-uuid 11111111-1111-1111-1111-111111111111", + }; + QTestState *from, *to; + g_autofree char *uri = NULL; + g_autofree char *file = NULL; + int pid, wstatus; + const char *python = g_getenv("PYTHON"); + + if (!python) { + g_test_skip("PYTHON variable not set"); + return; + } + + /* dummy url */ + if (test_migrate_start(&from, &to, "tcp:127.0.0.1:0", &args)) { + return; + } + + /* + * Setting these two capabilities causes the "configuration" + * vmstate to include subsections for them. The script needs to + * parse those subsections properly. + */ + migrate_set_capability(from, "validate-uuid", true); + migrate_set_capability(from, "x-ignore-shared", true); + + file = g_strdup_printf("%s/migfile", tmpfs); + uri = g_strdup_printf("exec:cat > %s", file); + + migrate_ensure_converge(from); + migrate_qmp(from, uri, "{}"); + wait_for_migration_complete(from); + + pid = fork(); + if (!pid) { + close(1); + open("/dev/null", O_WRONLY); + execl(python, python, ANALYZE_SCRIPT, "-f", file, NULL); + g_assert_not_reached(); + } + + g_assert(waitpid(pid, &wstatus, 0) == pid); + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != 0) { + g_test_message("Failed to analyze the migration stream"); + g_test_fail(); + } + test_migrate_end(from, to, false); + cleanup("migfile"); +} +#endif + static void test_precopy_common(MigrateCommon *args) { QTestState *from, *to; @@ -1610,6 +1672,70 @@ finish: test_migrate_end(from, to, args->result == MIG_TEST_SUCCEED); } +static void test_file_common(MigrateCommon *args, bool stop_src) +{ + QTestState *from, *to; + void *data_hook = NULL; + g_autofree char *connect_uri = g_strdup(args->connect_uri); + + if (test_migrate_start(&from, &to, args->listen_uri, &args->start)) { + return; + } + + /* + * File migration is never live. We can keep the source VM running + * during migration, but the destination will not be running + * concurrently. + */ + g_assert_false(args->live); + + if (args->start_hook) { + data_hook = args->start_hook(from, to); + } + + migrate_ensure_converge(from); + wait_for_serial("src_serial"); + + if (stop_src) { + qtest_qmp_assert_success(from, "{ 'execute' : 'stop'}"); + if (!got_src_stop) { + qtest_qmp_eventwait(from, "STOP"); + } + } + + if (args->result == MIG_TEST_QMP_ERROR) { + migrate_qmp_fail(from, connect_uri, "{}"); + goto finish; + } + + migrate_qmp(from, connect_uri, "{}"); + wait_for_migration_complete(from); + + /* + * We need to wait for the source to finish before starting the + * destination. 
+ */ + migrate_incoming_qmp(to, connect_uri, "{}"); + wait_for_migration_complete(to); + + if (stop_src) { + qtest_qmp_assert_success(to, "{ 'execute' : 'cont'}"); + } + + if (!got_dst_resume) { + qtest_qmp_eventwait(to, "RESUME"); + } + + wait_for_serial("dest_serial"); + +finish: + if (args->finish_hook) { + args->finish_hook(from, to, data_hook); + } + + test_migrate_end(from, to, args->result == MIG_TEST_SUCCEED); +} + static void test_precopy_unix_plain(void) { g_autofree char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); @@ -1805,6 +1931,76 @@ static void test_precopy_unix_compress_nowait(void) test_precopy_common(&args); } +static void test_precopy_file(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + }; + + test_file_common(&args, true); +} + +static void file_offset_finish_hook(QTestState *from, QTestState *to, + void *opaque) +{ +#if defined(__linux__) + g_autofree char *path = g_strdup_printf("%s/%s", tmpfs, FILE_TEST_FILENAME); + size_t size = FILE_TEST_OFFSET + sizeof(QEMU_VM_FILE_MAGIC); + uintptr_t *addr, *p; + int fd; + + fd = open(path, O_RDONLY); + g_assert(fd != -1); + addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + g_assert(addr != MAP_FAILED); + + /* + * Ensure the skipped offset contains zeros and the migration + * stream starts at the right place. + */ + p = addr; + while (p < addr + FILE_TEST_OFFSET / sizeof(uintptr_t)) { + g_assert(*p == 0); + p++; + } + g_assert_cmpint(cpu_to_be64(*p) >> 32, ==, QEMU_VM_FILE_MAGIC); + + munmap(addr, size); + close(fd); +#endif +} + +static void test_precopy_file_offset(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s,offset=%d", tmpfs, + FILE_TEST_FILENAME, + FILE_TEST_OFFSET); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .finish_hook = file_offset_finish_hook, + }; + + test_file_common(&args, false); +} + +static void test_precopy_file_offset_bad(void) +{ + /* using a value not supported by qemu_strtosz() */ + g_autofree char *uri = g_strdup_printf("file:%s/%s,offset=0x20M", + tmpfs, FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .result = MIG_TEST_QMP_ERROR, + }; + + test_file_common(&args, false); +} + static void test_precopy_tcp_plain(void) { MigrateCommon args = { @@ -2837,6 +3033,9 @@ int main(int argc, char **argv) } qtest_add_func("/migration/bad_dest", test_baddest); +#ifndef _WIN32 + qtest_add_func("/migration/analyze-script", test_analyze_script); +#endif qtest_add_func("/migration/precopy/unix/plain", test_precopy_unix_plain); qtest_add_func("/migration/precopy/unix/xbzrle", test_precopy_unix_xbzrle); /* @@ -2849,6 +3048,14 @@ int main(int argc, char **argv) qtest_add_func("/migration/precopy/unix/compress/nowait", test_precopy_unix_compress_nowait); } + + qtest_add_func("/migration/precopy/file", + test_precopy_file); + qtest_add_func("/migration/precopy/file/offset", + test_precopy_file_offset); + qtest_add_func("/migration/precopy/file/offset/bad", + test_precopy_file_offset_bad); + #ifdef CONFIG_GNUTLS qtest_add_func("/migration/precopy/unix/tls/psk", test_precopy_unix_tls_psk); |