author     William Henderson <william.henderson@nutanix.com>  2023-09-15 16:07:01 +0100
committer  GitHub <noreply@github.com>                        2023-09-15 16:07:01 +0100
commit     190f85bf9c114bf7c981bb8908394368f84c0c04
tree       92273a811fc3a8af74a5f62cec8871f345d6999b
parent     1569a37a54ecb63bd4008708c76339ccf7d06115
adapt to VFIO live migration v2 (#782)
This commit adapts the vfio-user protocol specification and the libvfio-user
implementation to v2 of the VFIO live migration interface, as used in the
kernel and QEMU.

The differences between v1 and v2 are discussed in this email thread [1], and
we slightly differ from upstream VFIO v2 in that instead of transferring data
over a new FD, we use the existing UNIX socket with new commands
VFIO_USER_MIG_DATA_READ/WRITE. We also don't yet use P2P states.

The updated spec was submitted to qemu-devel [2].

[1] https://lore.kernel.org/all/20220130160826.32449-9-yishaih@nvidia.com/
[2] https://lore.kernel.org/all/20230718094150.110183-1-william.henderson@nutanix.com/

Signed-off-by: William Henderson <william.henderson@nutanix.com>
-rw-r--r--  docs/vfio-user.rst                                 |  836
-rw-r--r--  include/libvfio-user.h                             |  162
-rw-r--r--  include/vfio-user.h                                |  105
-rw-r--r--  lib/common.h                                       |   41
-rw-r--r--  lib/dma.c                                          |  242
-rw-r--r--  lib/dma.h                                          |    1
-rw-r--r--  lib/libvfio-user.c                                 |  506
-rw-r--r--  lib/migration.c                                    |  572
-rw-r--r--  lib/migration.h                                    |   24
-rw-r--r--  lib/migration_priv.h                               |   86
-rw-r--r--  lib/private.h                                      |   14
-rw-r--r--  samples/client.c                                   |  422
-rw-r--r--  samples/gpio-pci-idio-16.c                         |   54
-rw-r--r--  samples/server.c                                   |  207
-rw-r--r--  test/mocks.c                                       |   20
-rw-r--r--  test/py/libvfio_user.py                            |  227
-rw-r--r--  test/py/test_device_get_region_info.py             |   46
-rw-r--r--  test/py/test_device_get_region_info_zero_size.py   |   41
-rw-r--r--  test/py/test_dirty_pages.py                        |  374
-rw-r--r--  test/py/test_dma_unmap.py                          |   20
-rw-r--r--  test/py/test_migration.py                          |  575
-rw-r--r--  test/py/test_quiesce.py                            |   21
-rw-r--r--  test/py/test_request_errors.py                     |   23
-rw-r--r--  test/py/test_setup_region.py                       |   24
-rw-r--r--  test/unit-tests.c                                  |  211
25 files changed, 2497 insertions(+), 2357 deletions(-)
diff --git a/docs/vfio-user.rst b/docs/vfio-user.rst
index 3c26da5..b83b359 100644
--- a/docs/vfio-user.rst
+++ b/docs/vfio-user.rst
@@ -1,11 +1,10 @@
.. include:: <isonum.txt>
-
********************************
vfio-user Protocol Specification
********************************
--------------
-Version_ 0.9.1
+Version_ 0.9.2
--------------
.. contents:: Table of Contents
@@ -342,9 +341,9 @@ usual ``msg_size`` field in the header, not the ``argsz`` field.
In a reply, the server sets ``argsz`` field to the size needed for a full
payload size. This may be less than the requested maximum size. This may be
-larger than the requested maximum size: in that case, the payload reply header
-is returned, but the ``argsz`` field in the reply indicates the needed size,
-allowing a client to allocate a larger buffer for holding the reply before
+larger than the requested maximum size: in that case, the full payload is not
+included in the reply, but the ``argsz`` field in the reply indicates the needed
+size, allowing a client to allocate a larger buffer for holding the reply before
trying again.
In addition, during negotiation (see `Version`_), the client and server may
@@ -357,8 +356,9 @@ Protocol Specification
To distinguish from the base VFIO symbols, all vfio-user symbols are prefixed
with ``vfio_user`` or ``VFIO_USER``. In this revision, all data is in the
-little-endian format, although this may be relaxed in future revisions in cases
-where the client and server are both big-endian.
+endianness of the host system, although this may be relaxed in future
+revisions in cases where the client and server run on different hosts
+with different endianness.
Unless otherwise specified, all sizes should be presumed to be in bytes.
@@ -385,7 +385,10 @@ Name Command Request Direction
``VFIO_USER_DMA_READ`` 11 server -> client
``VFIO_USER_DMA_WRITE`` 12 server -> client
``VFIO_USER_DEVICE_RESET`` 13 client -> server
-``VFIO_USER_DIRTY_PAGES`` 14 client -> server
+``VFIO_USER_REGION_WRITE_MULTI`` 15 client -> server
+``VFIO_USER_DEVICE_FEATURE`` 16 client -> server
+``VFIO_USER_MIG_DATA_READ`` 17 client -> server
+``VFIO_USER_MIG_DATA_WRITE`` 18 client -> server
====================================== ========= =================
Header
@@ -508,34 +511,33 @@ format:
Capabilities:
-+--------------------+--------+------------------------------------------------+
-| Name | Type | Description |
-+====================+========+================================================+
-| max_msg_fds | number | Maximum number of file descriptors that can be |
-| | | received by the sender in one message. |
-| | | Optional. If not specified then the receiver |
-| | | must assume a value of ``1``. |
-+--------------------+--------+------------------------------------------------+
-| max_data_xfer_size | number | Maximum ``count`` for data transfer messages; |
-| | | see `Read and Write Operations`_. Optional, |
-| | | with a default value of 1048576 bytes. |
-+--------------------+--------+------------------------------------------------+
-| migration | object | Migration capability parameters. If missing |
-| | | then migration is not supported by the sender. |
-+--------------------+--------+------------------------------------------------+
-| twin_socket | object | Parameters for twin-socket mode, which handles |
-| | | server-to-client commands and their replies on |
-| | | a separate socket. Optional. |
-+--------------------+--------+------------------------------------------------+
-
-The migration capability contains the following name/value pairs:
-
-+--------+--------+-----------------------------------------------+
-| Name | Type | Description |
-+========+========+===============================================+
-| pgsize | number | Page size of dirty pages bitmap. The smallest |
-| | | between the client and the server is used. |
-+--------+--------+-----------------------------------------------+
++--------------------+---------+-----------------------------------------------+
+| Name | Type | Description |
++====================+=========+===============================================+
+| max_msg_fds | number | Maximum number of file descriptors that can |
+| | | be received by the sender in one message. |
+| | | Optional. If not specified then the receiver |
+| | | must assume a value of ``1``. |
++--------------------+---------+-----------------------------------------------+
+| max_data_xfer_size | number | Maximum ``count`` for data transfer messages; |
+| | | see `Read and Write Operations`_. Optional, |
+| | | with a default value of 1048576 bytes. |
++--------------------+---------+-----------------------------------------------+
+| max_dma_maps       | number  | Maximum number of DMA map windows that can be |
+|                    |         | valid simultaneously. Optional, with a        |
+|                    |         | default value of 65535 (64k-1).               |
++--------------------+---------+-----------------------------------------------+
+| pgsizes | number | Page sizes supported in DMA map operations |
+| | | or'ed together. Optional, with a default |
+| | | value of supporting only 4k pages. |
++--------------------+---------+-----------------------------------------------+
+| twin_socket | object | Parameters for twin-socket mode, which |
+| | | handles server-to-client commands and their |
+| | | replies on a separate socket. Optional. |
++--------------------+---------+-----------------------------------------------+
+| write_multiple | boolean | ``VFIO_USER_REGION_WRITE_MULTI`` messages |
+| | | are supported if the value is ``true``. |
++--------------------+---------+-----------------------------------------------+
The ``twin_socket`` capability object holds these name/value pairs:
@@ -678,56 +680,18 @@ The request payload for this message is a structure of the following format:
+--------------+--------+------------------------+
| flags | 4 | 4 |
+--------------+--------+------------------------+
-| | +-----+-----------------------+ |
-| | | Bit | Definition | |
-| | +=====+=======================+ |
-| | | 0 | get dirty page bitmap | |
-| | +-----+-----------------------+ |
-| | | 1 | unmap all regions | |
-| | +-----+-----------------------+ |
-+--------------+--------+------------------------+
| address | 8 | 8 |
+--------------+--------+------------------------+
| size | 16 | 8 |
+--------------+--------+------------------------+
* *argsz* is the maximum size of the reply payload.
-* *flags* contains the following DMA region attributes:
-
- * *get dirty page bitmap* indicates that a dirty page bitmap must be
- populated before unmapping the DMA region. The client must provide a
- `VFIO Bitmap`_ structure, explained below, immediately following this
- entry.
- * *unmap all regions* indicates to unmap all the regions previously
- mapped via `VFIO_USER_DMA_MAP`. This flag cannot be combined with
- *get dirty page bitmap* and expects *address* and *size* to be 0.
-
+* *flags* is unused in this version.
* *address* is the base DMA address of the DMA region.
* *size* is the size of the DMA region.
The address and size of the DMA region being unmapped must match exactly a
-previous mapping. The size of request message depends on whether or not the
-*get dirty page bitmap* bit is set in Flags:
-
-* If not set, the size of the total request message is: 16 + 24.
-
-* If set, the size of the total request message is: 16 + 24 + 16.
-
-.. _VFIO Bitmap:
-
-VFIO Bitmap Format
-""""""""""""""""""
-
-+--------+--------+------+
-| Name | Offset | Size |
-+========+========+======+
-| pgsize | 0 | 8 |
-+--------+--------+------+
-| size | 8 | 8 |
-+--------+--------+------+
-
-* *pgsize* is the page size for the bitmap, in bytes.
-* *size* is the size for the bitmap, in bytes, excluding the VFIO bitmap header.
+previous mapping.
Reply
^^^^^
@@ -736,14 +700,8 @@ Upon receiving a ``VFIO_USER_DMA_UNMAP`` command, if the file descriptor is
mapped then the server must release all references to that DMA region before
replying, which potentially includes in-flight DMA transactions.
-The server responds with the original DMA entry in the request. If the
-*get dirty page bitmap* bit is set in flags in the request, then
-the server also includes the `VFIO Bitmap`_ structure sent in the request,
-followed by the corresponding dirty page bitmap, where each bit represents
-one page of size *pgsize* in `VFIO Bitmap`_ .
+The server responds with the original DMA entry in the request.
-The total size of the total reply message is:
-16 + 24 + (16 + *size* in `VFIO Bitmap`_ if *get dirty page bitmap* is set).
``VFIO_USER_DEVICE_GET_INFO``
-----------------------------
@@ -959,7 +917,7 @@ VFIO region info cap sparse mmap
+----------+--------+------+
| offset | 8 | 8 |
+----------+--------+------+
-| size | 16 | 9 |
+| size | 16 | 8 |
+----------+--------+------+
| ... | | |
+----------+--------+------+
@@ -973,39 +931,6 @@ VFIO region info cap sparse mmap
The VFIO sparse mmap area is defined in ``<linux/vfio.h>`` (``struct
vfio_region_info_cap_sparse_mmap``).
-VFIO region type cap header
-"""""""""""""""""""""""""""
-
-+------------------+---------------------------+
-| Name | Value |
-+==================+===========================+
-| id | VFIO_REGION_INFO_CAP_TYPE |
-+------------------+---------------------------+
-| version | 0x1 |
-+------------------+---------------------------+
-| next | <next> |
-+------------------+---------------------------+
-| region info type | VFIO region info type |
-+------------------+---------------------------+
-
-This capability is defined when a region is specific to the device.
-
-VFIO region info type cap
-"""""""""""""""""""""""""
-
-The VFIO region info type is defined in ``<linux/vfio.h>``
-(``struct vfio_region_info_cap_type``).
-
-+---------+--------+------+
-| Name | Offset | Size |
-+=========+========+======+
-| type | 0 | 4 |
-+---------+--------+------+
-| subtype | 4 | 4 |
-+---------+--------+------+
-
-The only device-specific region type and subtype supported by vfio-user is
-``VFIO_REGION_TYPE_MIGRATION`` (3) and ``VFIO_REGION_SUBTYPE_MIGRATION`` (1).
``VFIO_USER_DEVICE_GET_REGION_IO_FDS``
--------------------------------------
@@ -1071,7 +996,7 @@ Reply
* *argsz* is the size of the region IO FD info structure plus the
total size of the sub-region array. Thus, each array entry "i" is at offset
- i * ((argsz - 16) / count). Note that currently this is 40 bytes for both IO
+ i * ((argsz - 32) / count). Note that currently this is 40 bytes for both IO
FD types, but this is not to be relied on. As elsewhere, this indicates the
full reply payload size needed.
* *flags* must be zero
@@ -1087,8 +1012,8 @@ Note that it is the client's responsibility to verify the requested values (for
example, that the requested offset does not exceed the region's bounds).
Each sub-region given in the response has one of two possible structures,
-depending whether *type* is ``VFIO_USER_IO_FD_TYPE_IOEVENTFD`` (0) or
-``VFIO_USER_IO_FD_TYPE_IOREGIONFD`` (1):
+depending whether *type* is ``VFIO_USER_IO_FD_TYPE_IOEVENTFD`` or
+``VFIO_USER_IO_FD_TYPE_IOREGIONFD``:
Sub-Region IO FD info format (ioeventfd)
""""""""""""""""""""""""""""""""""""""""
@@ -1552,290 +1477,455 @@ Reply
This command message is sent from the client to the server to reset the device.
Neither the request or reply have a payload.
-``VFIO_USER_DIRTY_PAGES``
--------------------------
+``VFIO_USER_REGION_WRITE_MULTI``
+--------------------------------
+
+This message can be used to coalesce multiple device write operations
+into a single message. It is only used as an optimization when the
+outgoing message queue is relatively full.
+
+Request
+^^^^^^^
+
++---------+--------+----------+
+| Name | Offset | Size |
++=========+========+==========+
+| wr_cnt | 0 | 8 |
++---------+--------+----------+
+| wrs | 8 | variable |
++---------+--------+----------+
-This command is analogous to ``VFIO_IOMMU_DIRTY_PAGES``. It is sent by the client
-to the server in order to control logging of dirty pages, usually during a live
-migration.
+* *wr_cnt* is the number of device writes coalesced in the message.
+* *wrs* is an array of device writes defined below.
-Dirty page tracking is optional for server implementation; clients should not
-rely on it.
+Single Device Write Format
+""""""""""""""""""""""""""
+
++--------+--------+----------+
+| Name | Offset | Size |
++========+========+==========+
+| offset | 0 | 8 |
++--------+--------+----------+
+| region | 8 | 4 |
++--------+--------+----------+
+| count | 12 | 4 |
++--------+--------+----------+
+| data | 16 | 8 |
++--------+--------+----------+
+
+* *offset* is the offset into the region being accessed.
+* *region* is the index of the region being accessed.
+* *count* is the size of the data to be transferred. This format can
+  only describe writes of 8 bytes or less.
+* *data* is the data to write.
+
+Reply
+^^^^^
+
++---------+--------+----------+
+| Name | Offset | Size |
++=========+========+==========+
+| wr_cnt | 0 | 8 |
++---------+--------+----------+
+
+* *wr_cnt* is the number of device writes completed.
+
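As a sketch (names illustrative, not taken from this patch), the request
layout above maps onto packed C structures like so::

    #include <stdint.h>

    /* One coalesced write: offset 0/8, region 8/4, count 12/4, data 16/8. */
    struct vfio_user_write_one {
        uint64_t offset;     /* offset into the region */
        uint32_t region;     /* region index */
        uint32_t count;      /* bytes to write; at most 8 */
        uint8_t  data[8];    /* the data to write */
    } __attribute__((packed));

    /* The full request payload: wr_cnt followed by wr_cnt entries. */
    struct vfio_user_write_multi {
        uint64_t wr_cnt;
        struct vfio_user_write_one wrs[];
    } __attribute__((packed));
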
+``VFIO_USER_DEVICE_FEATURE``
+----------------------------
+
+This command is analogous to ``VFIO_DEVICE_FEATURE``. It is used to get, set, or
+probe feature data of the device.
Request
^^^^^^^
-+-------+--------+-----------------------------------------+
-| Name | Offset | Size |
-+=======+========+=========================================+
-| argsz | 0 | 4 |
-+-------+--------+-----------------------------------------+
-| flags | 4 | 4 |
-+-------+--------+-----------------------------------------+
-| | +-----+----------------------------------------+ |
-| | | Bit | Definition | |
-| | +=====+========================================+ |
-| | | 0 | VFIO_IOMMU_DIRTY_PAGES_FLAG_START | |
-| | +-----+----------------------------------------+ |
-| | | 1 | VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP | |
-| | +-----+----------------------------------------+ |
-| | | 2 | VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP | |
-| | +-----+----------------------------------------+ |
-+-------+--------+-----------------------------------------+
-
-* *argsz* is the size of the VFIO dirty bitmap info structure for
- ``START/STOP``; and for ``GET_BITMAP``, the maximum size of the reply payload
-
-* *flags* defines the action to be performed by the server:
-
- * ``VFIO_IOMMU_DIRTY_PAGES_FLAG_START`` instructs the server to start logging
- pages it dirties. Logging continues until explicitly disabled by
- ``VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP``.
-
- * ``VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP`` instructs the server to stop logging
- dirty pages.
-
- * ``VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP`` requests the server to return
- the dirty bitmap for a specific IOVA range. The IOVA range is specified by
- a "VFIO Bitmap Range" structure, which must immediately follow this
- "VFIO Dirty Pages" structure. See `VFIO Bitmap Range Format`_.
- This operation is only valid if logging of dirty pages has been previously
- started.
-
- These flags are mutually exclusive with each other.
-
-This part of the request is analogous to VFIO's ``struct
-vfio_iommu_type1_dirty_bitmap``.
-
-.. _VFIO Bitmap Range Format:
-
-VFIO Bitmap Range Format
+The request payload for this message is a structure of the following format.
+
++-------+--------+--------------------------------+
+| Name | Offset | Size |
++=======+========+================================+
+| argsz | 0 | 4 |
++-------+--------+--------------------------------+
+| flags | 4 | 4 |
++-------+--------+--------------------------------+
+| | +---------+---------------------------+ |
+| | | Bit | Definition | |
+| | +=========+===========================+ |
+| | | 0 to 15 | Feature index | |
+| | +---------+---------------------------+ |
+| | | 16 | VFIO_DEVICE_FEATURE_GET | |
+| | +---------+---------------------------+ |
+| | | 17 | VFIO_DEVICE_FEATURE_SET | |
+| | +---------+---------------------------+ |
+| | | 18 | VFIO_DEVICE_FEATURE_PROBE | |
+| | +---------+---------------------------+ |
++-------+--------+--------------------------------+
+| data | 8 | variable |
++-------+--------+--------------------------------+
+
+* *argsz* is the maximum size of the reply payload.
+
+* *flags* defines the action to be performed by the server and upon which
+ feature:
+
+ * The feature index consists of the least significant 16 bits of the flags
+ field, and can be accessed using the ``VFIO_DEVICE_FEATURE_MASK`` bit mask.
+
+ * ``VFIO_DEVICE_FEATURE_GET`` instructs the server to get the data for the
+ given feature.
+
+ * ``VFIO_DEVICE_FEATURE_SET`` instructs the server to set the feature data to
+ that given in the ``data`` field of the payload.
+
+ * ``VFIO_DEVICE_FEATURE_PROBE`` instructs the server to probe for feature
+ support. If ``VFIO_DEVICE_FEATURE_GET`` and/or ``VFIO_DEVICE_FEATURE_SET``
+ are also set, the probe will only return success if all of the indicated
+ methods are supported.
+
+ ``VFIO_DEVICE_FEATURE_GET`` and ``VFIO_DEVICE_FEATURE_SET`` are mutually
+ exclusive, except for use with ``VFIO_DEVICE_FEATURE_PROBE``.
+
+* *data* is specific to the particular feature. It is not used for probing.
+
+This part of the request is analogous to VFIO's ``struct vfio_device_feature``.
+
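For example, a client that wants to check whether the migration feature
supports ``GET`` could compose flags as in this sketch, using the macros
added to ``include/vfio-user.h`` by this patch::

    /* Probe feature index 1 (migration) for GET support. */
    uint32_t flags = (VFIO_DEVICE_FEATURE_MIGRATION & VFIO_DEVICE_FEATURE_MASK)
                     | VFIO_DEVICE_FEATURE_PROBE
                     | VFIO_DEVICE_FEATURE_GET;
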
+Reply
+^^^^^
+
+The reply payload must be the same as the request payload for setting or
+probing a feature. For getting a feature's data, the data is added in the data
+section and its length is added to ``argsz``.
+
+Device Features
+^^^^^^^^^^^^^^^
+
+The only device features supported by vfio-user are those related to migration,
+although this may change in the future. They are a subset of those supported in
+the VFIO implementation of the Linux kernel.
+
++----------------------------------------+---------------+
+| Name | Feature Index |
++========================================+===============+
+| VFIO_DEVICE_FEATURE_MIGRATION | 1 |
++----------------------------------------+---------------+
+| VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE | 2 |
++----------------------------------------+---------------+
+| VFIO_DEVICE_FEATURE_DMA_LOGGING_START | 6 |
++----------------------------------------+---------------+
+| VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP | 7 |
++----------------------------------------+---------------+
+| VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT | 8 |
++----------------------------------------+---------------+
+
+``VFIO_DEVICE_FEATURE_MIGRATION``
+"""""""""""""""""""""""""""""""""
+
+This feature indicates that the device can support the migration API through
+``VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE``. If ``GET`` succeeds, the ``RUNNING``
+and ``ERROR`` states are always supported. Support for additional states is
+indicated via the flags field; at least ``VFIO_MIGRATION_STOP_COPY`` must be
+set.
+
+There is no data field of the request message.
+
+The data field of the reply message is structured as follows:
+
++-------+--------+---------------------------+
+| Name | Offset | Size |
++=======+========+===========================+
+| flags | 0 | 8 |
++-------+--------+---------------------------+
+| | +-----+--------------------------+ |
+| | | Bit | Definition | |
+| | +=====+==========================+ |
+| | | 0 | VFIO_MIGRATION_STOP_COPY | |
+| | +-----+--------------------------+ |
+| | | 1 | VFIO_MIGRATION_P2P | |
+| | +-----+--------------------------+ |
+| | | 2 | VFIO_MIGRATION_PRE_COPY | |
+| | +-----+--------------------------+ |
++-------+--------+---------------------------+
+
+These flags are interpreted in the same way as VFIO.
+
+``VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE``
+""""""""""""""""""""""""""""""""""""""""
+
+Upon ``VFIO_DEVICE_FEATURE_SET``, execute a migration state change on the VFIO
+device. The new state is supplied in ``device_state``. The state transition must
+fully complete before the reply is sent.
+
+The data field of the reply message, as well as the ``SET`` request message, is
+structured as follows:
+
++--------------+--------+------+
+| Name | Offset | Size |
++==============+========+======+
+| device_state | 0 | 4 |
++--------------+--------+------+
+| data_fd | 4 | 4 |
++--------------+--------+------+
+
+* *device_state* is the current state of the device (for ``GET``) or the
+ state to transition to (for ``SET``). It is defined by the
+ ``vfio_device_mig_state`` enum as detailed below. These states are the states
+ of the device migration Finite State Machine.
+
++--------------------------------+-------+---------------------------------------------------------------------+
+| Name | State | Description |
++================================+=======+=====================================================================+
+| VFIO_DEVICE_STATE_ERROR | 0 | The device has failed and must be reset. |
++--------------------------------+-------+---------------------------------------------------------------------+
+| VFIO_DEVICE_STATE_STOP | 1 | The device does not change the internal or external state. |
++--------------------------------+-------+---------------------------------------------------------------------+
+| VFIO_DEVICE_STATE_RUNNING | 2 | The device is running normally. |
++--------------------------------+-------+---------------------------------------------------------------------+
+| VFIO_DEVICE_STATE_STOP_COPY | 3 | The device internal state can be read out. |
++--------------------------------+-------+---------------------------------------------------------------------+
+| VFIO_DEVICE_STATE_RESUMING | 4 | The device is stopped and is loading a new internal state. |
++--------------------------------+-------+---------------------------------------------------------------------+
+| VFIO_DEVICE_STATE_RUNNING_P2P | 5 | (not used in vfio-user) |
++--------------------------------+-------+---------------------------------------------------------------------+
+| VFIO_DEVICE_STATE_PRE_COPY | 6 | The device is running normally but tracking internal state changes. |
++--------------------------------+-------+---------------------------------------------------------------------+
+| VFIO_DEVICE_STATE_PRE_COPY_P2P | 7 | (not used in vfio-user) |
++--------------------------------+-------+---------------------------------------------------------------------+
+
+* *data_fd* is unused in vfio-user, as the ``VFIO_USER_MIG_DATA_READ`` and
+ ``VFIO_USER_MIG_DATA_WRITE`` messages are used instead for migration data
+ transport.
+
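As a non-normative sketch (error handling omitted), a client could build a
``SET`` request that enters ``STOP_COPY`` using the structures this patch
adds to ``include/vfio-user.h``::

    #include <stdlib.h>

    size_t sz = sizeof(struct vfio_user_device_feature)
                + sizeof(struct vfio_user_device_feature_mig_state);
    struct vfio_user_device_feature *req = calloc(1, sz);
    struct vfio_user_device_feature_mig_state *mig = (void *)req->data;

    req->argsz = sz;
    req->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig->device_state = VFIO_USER_DEVICE_STATE_STOP_COPY;
    /* req is now the payload of a VFIO_USER_DEVICE_FEATURE message. */
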
+Direct State Transitions
""""""""""""""""""""""""
+The device migration FSM is a Mealy machine, so actions are taken upon the arcs
+between FSM states. The following transitions need to be supported by the
+server, a subset of those defined in ``<linux/vfio.h>``
+(``enum vfio_device_mig_state``).
+
+* ``RUNNING -> STOP``, ``STOP_COPY -> STOP``: Stop the operation of the device.
+ The ``STOP_COPY`` arc terminates the data transfer session.
+
+* ``RESUMING -> STOP``: Terminate the data transfer session. Complete processing
+ of the migration data. Stop the operation of the device. If the delivered data
+ is found to be incomplete, inconsistent, or otherwise invalid, fail the
+ ``SET`` command and optionally transition to the ``ERROR`` state.
+
+* ``PRE_COPY -> RUNNING``: Terminate the data transfer session. The device is
+ now fully operational.
+
+* ``STOP -> RUNNING``: Start the operation of the device.
+
+* ``RUNNING -> PRE_COPY``, ``STOP -> STOP_COPY``: Begin the process of saving
+ the device state. The device operation is unchanged, but data transfer begins.
+ ``PRE_COPY`` and ``STOP_COPY`` are referred to as the "saving group" of
+ states.
+
+* ``PRE_COPY -> STOP_COPY``: Continue to transfer migration data, but stop
+ device operation.
+
+* ``STOP -> RESUMING``: Start the process of restoring the device state. The
+ internal device state may be changed to prepare the device to receive the
+ migration data.
+
+The ``STOP_COPY -> PRE_COPY`` transition is explicitly not allowed and should
+return an error if requested.
+
+``ERROR`` cannot be specified as a device state, but any transition request can
+be failed and then move the state into ``ERROR`` if the server was unable to
+execute the requested arc AND was unable to restore the device into any valid
+state. To recover from ``ERROR``, ``VFIO_USER_DEVICE_RESET`` must be used to
+return back to ``RUNNING``.
+
+If ``PRE_COPY`` is not supported, arcs touching it are removed.
+
+Complex State Transitions
+"""""""""""""""""""""""""
+
+The remaining possible transitions are to be implemented as combinations of the
+above FSM arcs. As there are multiple paths, the path should be selected based
+on the following rules:
+
+* Select the shortest path.
+
+* The path cannot have saving group states as interior arcs, only start/end
+ states.
+
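One way to encode these rules (a sketch; the patch implements its own
transition logic in ``lib/migration.c``) is a precomputed next-hop table over
the state values, walked one direct arc at a time::

    #include <stdint.h>

    /*
     * next_hop[cur][target] is the next direct arc to take from cur towards
     * target; 0 (ERROR) means no valid path. Assumes cur, target < 8 and
     * uses the state values above: STOP=1, RUNNING=2, STOP_COPY=3,
     * RESUMING=4, PRE_COPY=6.
     */
    static const uint8_t next_hop[8][8] = {
        [1] = { [2] = 2, [3] = 3, [4] = 4, [6] = 2 },  /* from STOP      */
        [2] = { [1] = 1, [3] = 1, [4] = 1, [6] = 6 },  /* from RUNNING   */
        [3] = { [1] = 1, [2] = 1, [4] = 1 },           /* from STOP_COPY */
        [4] = { [1] = 1, [2] = 1, [3] = 1, [6] = 1 },  /* from RESUMING  */
        [6] = { [1] = 2, [2] = 2, [3] = 3, [4] = 2 },  /* from PRE_COPY  */
    };

    static int
    transition_to(uint32_t cur, uint32_t target)
    {
        while (cur != target) {
            uint32_t next = next_hop[cur][target];
            if (next == 0) {
                return -1;  /* e.g. STOP_COPY -> PRE_COPY is forbidden */
            }
            /* ... execute the direct arc cur -> next here ... */
            cur = next;
        }
        return 0;
    }
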
+``VFIO_DEVICE_FEATURE_DMA_LOGGING_START`` / ``VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP``
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+Upon ``VFIO_DEVICE_FEATURE_SET``, start/stop DMA logging. These features can
+also be probed to determine whether the device supports DMA logging.
+
+When DMA logging is started, a range of IOVAs to monitor is provided and the
+device can optimize its logging to cover only the IOVA range given. Only DMA
+writes are logged.
+
+The data field of the ``SET`` request is structured as follows:
+
++------------+--------+----------+
+| Name | Offset | Size |
++============+========+==========+
+| page_size | 0 | 8 |
++------------+--------+----------+
+| num_ranges | 8 | 4 |
++------------+--------+----------+
+| reserved | 12 | 4 |
++------------+--------+----------+
+| ranges | 16 | variable |
++------------+--------+----------+
+
+* *page_size* hints what tracking granularity the device should try to achieve.
+ If the device cannot do the hinted page size then it's the driver's choice
+ which page size to pick based on its support. On output the device will return
+ the page size it selected.
+
+* *num_ranges* is the number of IOVA ranges to monitor. A value of zero
+ indicates that all writes should be logged.
+
+* *ranges* is an array of ``vfio_user_device_feature_dma_logging_range``
+ entries:
+
+--------+--------+------+
| Name | Offset | Size |
+========+========+======+
| iova | 0 | 8 |
+--------+--------+------+
-| size | 8 | 8 |
-+--------+--------+------+
-| bitmap | 16 | 24 |
+| length | 8 | 8 |
+--------+--------+------+
-* *iova* is the IOVA offset
+ * *iova* is the base IO virtual address
+ * *length* is the length of the range to log
+
+Upon success, the response data field will be the same as the request, unless
+the page size was changed, in which case this will be reflected in the response.
+
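To illustrate (a sketch, error handling omitted), starting 4 KiB granularity
logging over a single 1 GiB range with the structures this patch adds to
``include/vfio-user.h`` might look like::

    #include <stdlib.h>

    size_t sz = sizeof(struct vfio_user_device_feature_dma_logging_control)
                + sizeof(struct vfio_user_device_feature_dma_logging_range);
    struct vfio_user_device_feature_dma_logging_control *ctl = calloc(1, sz);

    ctl->page_size  = 4096;              /* hinted tracking granularity */
    ctl->num_ranges = 1;
    ctl->ranges[0].iova   = 0x10000000;  /* illustrative base IOVA */
    ctl->ranges[0].length = 1ULL << 30;  /* 1 GiB */
    /*
     * ctl now forms the data field of a VFIO_DEVICE_FEATURE_SET request with
     * feature index VFIO_DEVICE_FEATURE_DMA_LOGGING_START.
     */
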
+``VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT``
+""""""""""""""""""""""""""""""""""""""""""
+
+Upon ``VFIO_DEVICE_FEATURE_GET``, returns the dirty bitmap for a specific IOVA
+range. This operation is only valid if logging of dirty pages has been
+previously started by setting ``VFIO_DEVICE_FEATURE_DMA_LOGGING_START``.
+
+The data field of the request is structured as follows:
+
++-----------+--------+------+
+| Name | Offset | Size |
++===========+========+======+
+| iova | 0 | 8 |
++-----------+--------+------+
+| length | 8 | 8 |
++-----------+--------+------+
+| page_size | 16 | 8 |
++-----------+--------+------+
+
+* *iova* is the base IO virtual address
+
+* *length* is the length of the range
+
+* *page_size* is the unit of granularity of the bitmap, and must be a power of
+  two. It doesn't have to match the value given to
+  ``VFIO_DEVICE_FEATURE_DMA_LOGGING_START``, because the driver will format its
+  internal logging to match the reporting page size, possibly by replicating
+  bits if the internal page size is smaller than requested.
+
+The data field of the response is identical, except with the bitmap added on
+the end at offset 24.
+
+The bitmap is an array of u64s that holds the output bitmap, with 1 bit
+reporting a *page_size* unit of IOVA. The bits outside of the requested range
+must be zero.
+
+The mapping of IOVA to bits is given by:
+
+``bitmap[(addr - iova)/page_size] & (1ULL << (addr % 64))``
+
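Read as C (one plausible decoding; a sketch, not code from the patch), with
the bitmap treated as an array of 64-bit words::

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    page_is_dirty(const uint64_t *bitmap, uint64_t iova, uint64_t page_size,
                  uint64_t addr)
    {
        uint64_t bit = (addr - iova) / page_size;  /* the page's bit index */
        return (bitmap[bit / 64] >> (bit % 64)) & 1;
    }
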
+``VFIO_USER_MIG_DATA_READ``
+---------------------------
+
+This command is used to read data from the source migration server while it is
+in a saving group state (``PRE_COPY`` or ``STOP_COPY``).
+
+This command, and ``VFIO_USER_MIG_DATA_WRITE``, are used in place of the
+``data_fd`` file descriptor in ``<linux/vfio.h>``
+(``struct vfio_device_feature_mig_state``) to enable all data transport to use
+the single already-established UNIX socket. Hence, the migration data is
+treated like a stream, so the client must continue reading until no more
+migration data remains.
+
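A client-side drain loop might look like this sketch, where
``mig_data_read()`` is a hypothetical helper that issues one
``VFIO_USER_MIG_DATA_READ`` request and returns the *size* field of the reply
(both formats are given below)::

    enum { CHUNK = 65536 };
    uint8_t buf[CHUNK];
    ssize_t n;

    do {
        n = mig_data_read(sock, buf, CHUNK);  /* hypothetical helper */
        if (n < 0) {
            break;                            /* transport or server error */
        }
        forward_to_destination(buf, n);       /* hypothetical sink */
    } while (n == CHUNK);                     /* short read: no data left */
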
+Request
+^^^^^^^
+
+The request payload for this message is a structure of the following format.
-* *size* is the size of the IOVA region
++-------+--------+------+
+| Name | Offset | Size |
++=======+========+======+
+| argsz | 0 | 4 |
++-------+--------+------+
+| size | 4 | 4 |
++-------+--------+------+
-* *bitmap* is the VFIO Bitmap explained in `VFIO Bitmap`_.
+* *argsz* is the maximum size of the reply payload.
-This part of the request is analogous to VFIO's ``struct
-vfio_iommu_type1_dirty_bitmap_get``.
+* *size* is the size of the migration data to read.
Reply
^^^^^
-For ``VFIO_IOMMU_DIRTY_PAGES_FLAG_START`` or
-``VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP``, there is no reply payload.
-
-For ``VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP``, the reply payload is as follows:
-
-+--------------+--------+-----------------------------------------+
-| Name | Offset | Size |
-+==============+========+=========================================+
-| argsz | 0 | 4 |
-+--------------+--------+-----------------------------------------+
-| flags | 4 | 4 |
-+--------------+--------+-----------------------------------------+
-| | +-----+----------------------------------------+ |
-| | | Bit | Definition | |
-| | +=====+========================================+ |
-| | | 2 | VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP | |
-| | +-----+----------------------------------------+ |
-+--------------+--------+-----------------------------------------+
-| bitmap range | 8 | 40 |
-+--------------+--------+-----------------------------------------+
-| bitmap | 48 | variable |
-+--------------+--------+-----------------------------------------+
-
-* *argsz* is the size required for the full reply payload (dirty pages structure
- + bitmap range structure + actual bitmap)
-* *flags* is ``VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP``
-* *bitmap range* is the same bitmap range struct provided in the request, as
- defined in `VFIO Bitmap Range Format`_.
-* *bitmap* is the actual dirty pages bitmap corresponding to the range request
-
-VFIO Device Migration Info
---------------------------
+The reply payload for this message is a structure of the following format.
+
++-------+--------+----------+
+| Name | Offset | Size |
++=======+========+==========+
+| argsz | 0 | 4 |
++-------+--------+----------+
+| size | 4 | 4 |
++-------+--------+----------+
+| data | 8 | variable |
++-------+--------+----------+
-A device may contain a migration region (of type
-``VFIO_REGION_TYPE_MIGRATION``). The beginning of the region must contain
-``struct vfio_device_migration_info``, defined in ``<linux/vfio.h>``. This
-subregion is accessed like any other part of a standard vfio-user region
-using ``VFIO_USER_REGION_READ``/``VFIO_USER_REGION_WRITE``.
-
-+---------------+--------+--------------------------------+
-| Name | Offset | Size |
-+===============+========+================================+
-| device_state | 0 | 4 |
-+---------------+--------+--------------------------------+
-| | +-----+-------------------------------+ |
-| | | Bit | Definition | |
-| | +=====+===============================+ |
-| | | 0 | VFIO_DEVICE_STATE_V1_RUNNING | |
-| | +-----+-------------------------------+ |
-| | | 1 | VFIO_DEVICE_STATE_V1_SAVING | |
-| | +-----+-------------------------------+ |
-| | | 2 | VFIO_DEVICE_STATE_V1_RESUMING | |
-| | +-----+-------------------------------+ |
-+---------------+--------+--------------------------------+
-| reserved | 4 | 4 |
-+---------------+--------+--------------------------------+
-| pending_bytes | 8 | 8 |
-+---------------+--------+--------------------------------+
-| data_offset | 16 | 8 |
-+---------------+--------+--------------------------------+
-| data_size | 24 | 8 |
-+---------------+--------+--------------------------------+
-
-* *device_state* defines the state of the device:
-
- The client initiates device state transition by writing the intended state.
- The server must respond only after it has successfully transitioned to the new
- state. If an error occurs then the server must respond to the
- ``VFIO_USER_REGION_WRITE`` operation with the Error field set accordingly and
- must remain at the previous state, or in case of internal error it must
- transition to the error state, defined as
- ``VFIO_DEVICE_STATE_V1_RESUMING | VFIO_DEVICE_STATE_V1_SAVING``. The client
- must re-read the device state in order to determine it afresh.
-
- The following device states are defined:
-
- +-----------+---------+----------+-----------------------------------+
- | _RESUMING | _SAVING | _RUNNING | Description |
- +===========+=========+==========+===================================+
- | 0 | 0 | 0 | Device is stopped. |
- +-----------+---------+----------+-----------------------------------+
- | 0 | 0 | 1 | Device is running, default state. |
- +-----------+---------+----------+-----------------------------------+
- | 0 | 1 | 0 | Stop-and-copy state |
- +-----------+---------+----------+-----------------------------------+
- | 0 | 1 | 1 | Pre-copy state |
- +-----------+---------+----------+-----------------------------------+
- | 1 | 0 | 0 | Resuming |
- +-----------+---------+----------+-----------------------------------+
- | 1 | 0 | 1 | Invalid state |
- +-----------+---------+----------+-----------------------------------+
- | 1 | 1 | 0 | Error state |
- +-----------+---------+----------+-----------------------------------+
- | 1 | 1 | 1 | Invalid state |
- +-----------+---------+----------+-----------------------------------+
-
- Valid state transitions are shown in the following table:
-
- +-------------------------+---------+---------+---------------+----------+----------+
- | |darr| From / To |rarr| | Stopped | Running | Stop-and-copy | Pre-copy | Resuming |
- +=========================+=========+=========+===============+==========+==========+
- | Stopped | \- | 1 | 0 | 0 | 0 |
- +-------------------------+---------+---------+---------------+----------+----------+
- | Running | 1 | \- | 1 | 1 | 1 |
- +-------------------------+---------+---------+---------------+----------+----------+
- | Stop-and-copy | 1 | 1 | \- | 0 | 0 |
- +-------------------------+---------+---------+---------------+----------+----------+
- | Pre-copy | 0 | 0 | 1 | \- | 0 |
- +-------------------------+---------+---------+---------------+----------+----------+
- | Resuming | 0 | 1 | 0 | 0 | \- |
- +-------------------------+---------+---------+---------------+----------+----------+
-
- A device is migrated to the destination as follows:
-
- * The source client transitions the device state from the running state to
- the pre-copy state. This transition is optional for the client but must be
- supported by the server. The source server starts sending device state data
- to the source client through the migration region while the device is
- running.
-
- * The source client transitions the device state from the running state or the
- pre-copy state to the stop-and-copy state. The source server stops the
- device, saves device state and sends it to the source client through the
- migration region.
-
- The source client is responsible for sending the migration data to the
- destination client.
-
- A device is resumed on the destination as follows:
-
- * The destination client transitions the device state from the running state
- to the resuming state. The destination server uses the device state data
- received through the migration region to resume the device.
-
- * The destination client provides saved device state to the destination
- server and then transitions the device to back to the running state.
-
-* *reserved* This field is reserved and any access to it must be ignored by the
- server.
-
-* *pending_bytes* Remaining bytes to be migrated by the server. This field is
- read only.
-
-* *data_offset* Offset in the migration region where the client must:
-
- * read from, during the pre-copy or stop-and-copy state, or
-
- * write to, during the resuming state.
-
- This field is read only.
-
-* *data_size* Contains the size, in bytes, of the amount of data copied to:
-
- * the source migration region by the source server during the pre-copy or
- stop-and copy state, or
-
- * the destination migration region by the destination client during the
- resuming state.
-
-Device-specific data must be stored at any position after
-``struct vfio_device_migration_info``. Note that the migration region can be
-memory mappable, even partially. In practise, only the migration data portion
-can be memory mapped.
-
-The client processes device state data during the pre-copy and the
-stop-and-copy state in the following iterative manner:
-
- 1. The client reads ``pending_bytes`` to mark a new iteration. Repeated reads
- of this field is an idempotent operation. If there are no migration data
- to be consumed then the next step depends on the current device state:
-
- * pre-copy: the client must try again.
+* *argsz* is the size of the above structure, including the size of the data.
- * stop-and-copy: this procedure can end and the device can now start
- resuming on the destination.
+* *size* indicates the size of returned migration data. If this is less than the
+ requested size, there is no more migration data to read.
- 2. The client reads ``data_offset``; at this point the server must make
- available a portion of migration data at this offset to be read by the
- client, which must happen *before* completing the read operation. The
- amount of data to be read must be stored in the ``data_size`` field, which
- the client reads next.
+* *data* contains the migration data.
- 3. The client reads ``data_size`` to determine the amount of migration data
- available.
+``VFIO_USER_MIG_DATA_WRITE``
+----------------------------
- 4. The client reads and processes the migration data.
+This command is used to write data to the destination migration server while it
+is in the ``RESUMING`` state.
- 5. Go to step 1.
+As above, this replaces the ``data_fd`` file descriptor for transport of
+migration data, and as such, the migration data is treated like a stream.
-Note that the client can transition the device from the pre-copy state to the
-stop-and-copy state at any time; ``pending_bytes`` does not need to become zero.
+Request
+^^^^^^^
+
+The request payload for this message is a structure of the following format.
+
++-------+--------+----------+
+| Name | Offset | Size |
++=======+========+==========+
+| argsz | 0 | 4 |
++-------+--------+----------+
+| size | 4 | 4 |
++-------+--------+----------+
+| data | 8 | variable |
++-------+--------+----------+
+
+* *argsz* is the maximum size of the reply payload.
+
+* *size* is the size of the migration data to be written.
+
+* *data* contains the migration data.
-The client initializes the device state on the destination by setting the
-device state in the resuming state and writing the migration data to the
-destination migration region at ``data_offset`` offset. The client can write the
-source migration data in an iterative manner and the server must consume this
-data before completing each write operation, updating the ``data_offset`` field.
-The server must apply the source migration data on the device resume state. The
-client must write data on the same order and transaction size as read.
+Reply
+^^^^^
-If an error occurs then the server must fail the read or write operation. It is
-an implementation detail of the client how to handle errors.
+There is no reply payload for this message.
Appendices
==========
diff --git a/include/libvfio-user.h b/include/libvfio-user.h
index 21cb99a..e4cfa60 100644
--- a/include/libvfio-user.h
+++ b/include/libvfio-user.h
@@ -583,21 +583,8 @@ typedef enum {
VFU_MIGR_STATE_RESUME
} vfu_migr_state_t;
-#define VFU_MIGR_CALLBACKS_VERS 1
+#define VFU_MIGR_CALLBACKS_VERS 2
-/*
- * Callbacks during the pre-copy and stop-and-copy phases.
- *
- * The client executes the following steps to copy migration data:
- *
- * 1. get_pending_bytes: device must return amount of migration data
- * 2. prepare_data: device must prepare migration data
- * 3. read_data: device must provide migration data
- *
- * The client repeats the above steps until there is no more migration data to
- * return (the device must return 0 from get_pending_bytes to indicate that
- * there are no more migration data to be consumed in this iteration).
- */
typedef struct {
/*
@@ -615,152 +602,30 @@ typedef struct {
* FIXME maybe we should create a single callback and pass the state?
*/
int (*transition)(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state);
-
- /* Callbacks for saving device state */
-
- /*
- * Function that is called to retrieve the amount of pending migration
- * data. If migration data were previously made available (function
- * prepare_data has been called) then calling this function signifies that
- * they have been read (e.g. migration data can be discarded). If the
- * function returns 0 then migration has finished and this function won't
- * be called again.
- *
- * The amount of pending migration data returned by the device does not
- * necessarily have to monotonically decrease over time and does not need
- * to match the amount of migration data returned via the @size argument in
- * prepare_data. It can completely fluctuate according to the needs of the
- * device. These semantics are derived from the pending_bytes register in
- * VFIO. Therefore the value returned by get_pending_bytes must be
- * primarily regarded as boolean, either 0 or non-zero, as far as migration
- * completion is concerned. More advanced vfio-user clients can make
- * assumptions on how migration is progressing on devices that guarantee
- * that the amount of pending migration data decreases over time.
- */
- uint64_t (*get_pending_bytes)(vfu_ctx_t *vfu_ctx);
-
- /*
- * Function that is called to instruct the device to prepare migration data
- * to be read when in pre-copy or stop-and-copy state, and to prepare for
- * receiving migration data when in resuming state.
- *
- * When in pre-copy and stop-and-copy state, the function must return only
- * after migration data are available at the specified offset. This
- * callback is called once per iteration. The amount of data available
- * pointed to by @size can be different that the amount of data returned by
- * get_pending_bytes in the beginning of the iteration.
- *
- * In VFIO, the data_offset and data_size registers can be read multiple
- * times during an iteration and are invariant, libvfio-user simplifies
- * this by caching the values and returning them when read, guaranteeing
- * that prepare_data() is called only once per migration iteration.
- *
- * When in resuming state, @offset must be set to where migration data must
- * written. @size points to NULL.
- *
- * The callback should return -1 on error, setting errno.
- */
- int (*prepare_data)(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size);
-
+
/*
- * Function that is called to read migration data. offset and size can be
- * any subrange on the offset and size previously returned by prepare_data.
- * The function must return the amount of data read or -1 on error, setting
- * errno.
+ * Function that is called to read `count` bytes of migration data into
+ * `buf`. The function must return the amount of data read or -1 on error,
+ * setting errno. The function may return less data than requested.
*
- * This function can be called even if the migration data can be memory
- * mapped.
+ * If the function returns zero, this is interpreted to mean that there is
+ * no more migration data to read.
*/
- ssize_t (*read_data)(vfu_ctx_t *vfu_ctx, void *buf,
- uint64_t count, uint64_t offset);
-
- /* Callbacks for restoring device state */
+ ssize_t (*read_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count);
/*
- * Fuction that is called for writing previously stored device state. The
+ * Function that is called for writing previously stored device state. The
* function must return the amount of data written or -1 on error, setting
- * errno.
- */
- ssize_t (*write_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count,
- uint64_t offset);
-
- /*
- * Function that is called when client has written some previously stored
- * device state.
- *
- * The callback should return -1 on error, setting errno.
+ * errno. Partial writes are not supported, so any return value other than
+ * `count` is invalid.
*/
- int (*data_written)(vfu_ctx_t *vfu_ctx, uint64_t count);
+ ssize_t (*write_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count);
} vfu_migration_callbacks_t;
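/*
 * A sketch (not part of this patch) of the v2 callbacks for a device whose
 * migratable state is one flat buffer. `struct my_dev` and its fields are
 * illustrative; needs <errno.h> and <string.h>. Note that the v2 signatures
 * carry no offset: migration data is treated as a stream.
 */
struct my_dev {
    uint8_t *state;        /* flat migratable state buffer */
    uint64_t state_size;
    uint64_t mig_off;      /* stream cursor */
};

static ssize_t
my_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count)
{
    struct my_dev *d = vfu_get_private(vfu_ctx);
    uint64_t left = d->state_size - d->mig_off;
    uint64_t n = count < left ? count : left;

    memcpy(buf, d->state + d->mig_off, n);
    d->mig_off += n;
    return n;              /* 0 once all state has been read */
}

static ssize_t
my_write_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count)
{
    struct my_dev *d = vfu_get_private(vfu_ctx);

    if (d->mig_off + count > d->state_size) {
        errno = EINVAL;    /* more data than this device can hold */
        return -1;
    }
    memcpy(d->state + d->mig_off, buf, count);
    d->mig_off += count;
    return count;          /* partial writes are invalid */
}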
-/**
- * The definition for VFIO_DEVICE_STATE_XXX differs with the version of vfio
- * header file used. Some old systems wouldn't have these definitions. Some
- * other newer systems would be using region based migration, and not
- * have VFIO_DEVICE_STATE_V1_XXXX defined. The latest ones have
- * VFIO_DEVICE_STATE_V1_XXXX defined. The following addresses all
- * these scenarios.
- */
-#if defined(VFIO_DEVICE_STATE_STOP)
-
-_Static_assert(VFIO_DEVICE_STATE_STOP == 0,
- "incompatible VFIO_DEVICE_STATE_STOP definition");
-
-#define VFIO_DEVICE_STATE_V1_STOP VFIO_DEVICE_STATE_STOP
-#define VFIO_DEVICE_STATE_V1_RUNNING VFIO_DEVICE_STATE_RUNNING
-#define VFIO_DEVICE_STATE_V1_SAVING VFIO_DEVICE_STATE_SAVING
-#define VFIO_DEVICE_STATE_V1_RESUMING VFIO_DEVICE_STATE_RESUMING
-
-#elif !defined(VFIO_REGION_TYPE_MIGRATION_DEPRECATED) /* VFIO_DEVICE_STATE_STOP */
-
-#define VFIO_DEVICE_STATE_V1_STOP (0)
-#define VFIO_DEVICE_STATE_V1_RUNNING (1 << 0)
-#define VFIO_DEVICE_STATE_V1_SAVING (1 << 1)
-#define VFIO_DEVICE_STATE_V1_RESUMING (1 << 2)
-#define VFIO_DEVICE_STATE_MASK ((1 << 3) - 1)
-
-#endif /* VFIO_REGION_TYPE_MIGRATION_DEPRECATED */
-
-/*
- * The currently defined migration registers; if using migration callbacks,
- * these are handled internally by the library.
- *
- * This is analogous to struct vfio_device_migration_info.
- */
-struct vfio_user_migration_info {
- /* VFIO_DEVICE_STATE_* */
- uint32_t device_state;
- uint32_t reserved;
- uint64_t pending_bytes;
- uint64_t data_offset;
- uint64_t data_size;
-};
-
-/*
- * Returns the size of the area needed to hold the migration registers at the
- * beginning of the migration region; guaranteed to be page aligned.
- */
-size_t
-vfu_get_migr_register_area_size(void);
-
-/**
- * vfu_setup_device_migration provides an abstraction over the migration
- * protocol: the user specifies a set of callbacks which are called in response
- * to client accesses of the migration region; the migration region read/write
- * callbacks are not called after this function call. Offsets in callbacks are
- * relative to @data_offset.
- *
- * @vfu_ctx: the libvfio-user context
- * @callbacks: migration callbacks
- * @data_offset: offset in the migration region where data begins.
- *
- * @returns 0 on success, -1 on error, sets errno.
- */
int
vfu_setup_device_migration_callbacks(vfu_ctx_t *vfu_ctx,
- const vfu_migration_callbacks_t *callbacks,
- uint64_t data_offset);
+ const vfu_migration_callbacks_t *callbacks);
/**
* Triggers an interrupt.
@@ -906,7 +771,6 @@ enum {
VFU_PCI_DEV_ROM_REGION_IDX,
VFU_PCI_DEV_CFG_REGION_IDX,
VFU_PCI_DEV_VGA_REGION_IDX,
- VFU_PCI_DEV_MIGR_REGION_IDX,
VFU_PCI_DEV_NUM_REGIONS,
};
diff --git a/include/vfio-user.h b/include/vfio-user.h
index a749938..0b115d3 100644
--- a/include/vfio-user.h
+++ b/include/vfio-user.h
@@ -66,7 +66,10 @@ enum vfio_user_command {
VFIO_USER_DMA_READ = 11,
VFIO_USER_DMA_WRITE = 12,
VFIO_USER_DEVICE_RESET = 13,
- VFIO_USER_DIRTY_PAGES = 14,
+ VFIO_USER_REGION_WRITE_MULTI = 15,
+ VFIO_USER_DEVICE_FEATURE = 16,
+ VFIO_USER_MIG_DATA_READ = 17,
+ VFIO_USER_MIG_DATA_WRITE = 18,
VFIO_USER_MAX,
};
@@ -200,31 +203,97 @@ typedef struct vfio_user_region_io_fds_reply {
} sub_regions[];
} __attribute__((packed)) vfio_user_region_io_fds_reply_t;
+/* Analogous to struct vfio_device_feature_dma_logging_range */
+struct vfio_user_device_feature_dma_logging_range {
+ uint64_t iova;
+ uint64_t length;
+} __attribute__((packed));
-/* Analogous to vfio_iommu_type1_dirty_bitmap. */
-struct vfio_user_dirty_pages {
- uint32_t argsz;
-#ifndef VFIO_IOMMU_DIRTY_PAGES_FLAG_START
-#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0)
-#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1)
-#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
-#endif
- uint32_t flags;
+/* Analogous to struct vfio_device_feature_dma_logging_control */
+struct vfio_user_device_feature_dma_logging_control {
+ uint64_t page_size;
+ uint32_t num_ranges;
+ uint32_t reserved;
+ struct vfio_user_device_feature_dma_logging_range ranges[];
} __attribute__((packed));
-/* Analogous to struct vfio_iommu_type1_dirty_bitmap_get. */
-struct vfio_user_bitmap_range {
+/* Analogous to struct vfio_device_feature_dma_logging_report */
+struct vfio_user_device_feature_dma_logging_report {
uint64_t iova;
- uint64_t size;
- struct vfio_user_bitmap bitmap;
+ uint64_t length;
+ uint64_t page_size;
+ uint8_t bitmap[];
+} __attribute__((packed));
+
+#ifndef VFIO_DEVICE_FEATURE_DMA_LOGGING_START
+#define VFIO_DEVICE_FEATURE_DMA_LOGGING_START 6
+#define VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP 7
+#define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8
+#endif
+
+/* Analogous to struct vfio_device_feature */
+struct vfio_user_device_feature {
+ uint32_t argsz;
+ uint32_t flags;
+#ifndef VFIO_DEVICE_FEATURE_MASK
+#define VFIO_DEVICE_FEATURE_MASK (0xffff) /* 16-bit feature index */
+#define VFIO_DEVICE_FEATURE_GET (1 << 16) /* Get feature into data[] */
+#define VFIO_DEVICE_FEATURE_SET (1 << 17) /* Set feature from data[] */
+#define VFIO_DEVICE_FEATURE_PROBE (1 << 18) /* Probe feature support */
+#endif
+ uint8_t data[];
+} __attribute__((packed));
+
+/* Analogous to struct vfio_device_feature_migration */
+struct vfio_user_device_feature_migration {
+ uint64_t flags;
+#ifndef VFIO_REGION_TYPE_MIGRATION_DEPRECATED
+#define VFIO_MIGRATION_STOP_COPY (1 << 0)
+#define VFIO_MIGRATION_P2P (1 << 1)
+#endif
+/*
+ * PRE_COPY was added in a later kernel version, after
+ * VFIO_REGION_TYPE_MIGRATION_DEPRECATED had been introduced.
+ */
+#ifndef VFIO_MIGRATION_PRE_COPY
+#define VFIO_MIGRATION_PRE_COPY (1 << 2)
+#endif
} __attribute__((packed));
+#ifndef VFIO_REGION_TYPE_MIGRATION_DEPRECATED
+#define VFIO_DEVICE_FEATURE_MIGRATION 1
+#endif
+_Static_assert(sizeof(struct vfio_user_device_feature_migration) == 8,
+ "bad vfio_user_device_feature_migration size");
-#ifndef VFIO_REGION_TYPE_MIGRATION
+/* Analogous to struct vfio_device_feature_mig_state */
+struct vfio_user_device_feature_mig_state {
+ uint32_t device_state;
+ uint32_t data_fd;
+} __attribute__((packed));
+#ifndef VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
+#define VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE 2
+#endif
+_Static_assert(sizeof(struct vfio_user_device_feature_mig_state) == 8,
+               "bad vfio_user_device_feature_mig_state size");
-#define VFIO_REGION_TYPE_MIGRATION (3)
-#define VFIO_REGION_SUBTYPE_MIGRATION (1)
+/* Analogous to enum vfio_device_mig_state */
+enum vfio_user_device_mig_state {
+ VFIO_USER_DEVICE_STATE_ERROR = 0,
+ VFIO_USER_DEVICE_STATE_STOP = 1,
+ VFIO_USER_DEVICE_STATE_RUNNING = 2,
+ VFIO_USER_DEVICE_STATE_STOP_COPY = 3,
+ VFIO_USER_DEVICE_STATE_RESUMING = 4,
+ VFIO_USER_DEVICE_STATE_RUNNING_P2P = 5,
+ VFIO_USER_DEVICE_STATE_PRE_COPY = 6,
+ VFIO_USER_DEVICE_STATE_PRE_COPY_P2P = 7,
+ VFIO_USER_DEVICE_NUM_STATES = 8,
+};
-#endif /* VFIO_REGION_TYPE_MIGRATION */
+struct vfio_user_mig_data {
+ uint32_t argsz;
+ uint32_t size;
+ uint8_t data[];
+} __attribute__((packed));
#ifdef __cplusplus
}
diff --git a/lib/common.h b/lib/common.h
index 07a74a5..40b9b27 100644
--- a/lib/common.h
+++ b/lib/common.h
@@ -41,6 +41,7 @@
#include <limits.h>
#include <stdint.h>
#include <unistd.h>
+#include <sys/uio.h>
#define UNUSED __attribute__((unused))
#define EXPORT __attribute__((visibility("default")))
@@ -62,6 +63,20 @@
typedef unsigned long long ull_t;
+static inline int
+ERROR_INT(int err)
+{
+ errno = err;
+ return -1;
+}
+
+static inline void *
+ERROR_PTR(int err)
+{
+ errno = err;
+ return NULL;
+}
+
/* Saturating uint64_t addition. */
static inline uint64_t
satadd_u64(uint64_t a, uint64_t b)
@@ -73,11 +88,21 @@ satadd_u64(uint64_t a, uint64_t b)
/*
* The size, in bytes, of the bitmap that represents the given range with the
* given page size.
+ *
+ * Returns -1 and sets errno if the given page size is invalid for the given
+ * range.
*/
-static inline size_t
-_get_bitmap_size(size_t size, size_t pgsize)
+static inline ssize_t
+get_bitmap_size(size_t region_size, size_t pgsize)
{
- size_t nr_pages = (size / pgsize) + (size % pgsize != 0);
+ if (pgsize == 0) {
+ return ERROR_INT(EINVAL);
+ }
+ if (region_size < pgsize) {
+ return ERROR_INT(EINVAL);
+ }
+
+ size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0);
return ROUND_UP(nr_pages, sizeof(uint64_t) * CHAR_BIT) / CHAR_BIT;
}
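/*
 * For example, get_bitmap_size(1 MiB, 4 KiB) returns 32: the region spans
 * 256 pages, and 256 bits rounded up to whole 64-bit words occupy 32 bytes.
 */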
@@ -107,6 +132,16 @@ close_safely(int *fd)
errno = saved_errno;
}
+static inline void
+iov_free(struct iovec *iov)
+{
+ if (iov->iov_base != NULL) {
+ free(iov->iov_base);
+ iov->iov_base = NULL;
+ }
+ iov->iov_len = 0;
+}
+
#ifdef UNIT_TEST
#define MOCK_DEFINE(f) \
diff --git a/lib/dma.c b/lib/dma.c
index 9ca34d0..10e38ff 100644
--- a/lib/dma.c
+++ b/lib/dma.c
@@ -255,19 +255,6 @@ dma_map_region(dma_controller_t *dma, dma_memory_region_t *region)
return 0;
}
-static ssize_t
-get_bitmap_size(size_t region_size, size_t pgsize)
-{
- if (pgsize == 0) {
- return ERROR_INT(EINVAL);
- }
- if (region_size < pgsize) {
- return ERROR_INT(EINVAL);
- }
-
- return _get_bitmap_size(region_size, pgsize);
-}
-
static int
dirty_page_logging_start_on_region(dma_memory_region_t *region, size_t pgsize)
{
@@ -530,28 +517,173 @@ dma_controller_dirty_page_logging_stop(dma_controller_t *dma)
#ifdef DEBUG
static void
log_dirty_bitmap(vfu_ctx_t *vfu_ctx, dma_memory_region_t *region,
- char *bitmap, size_t size)
+ char *bitmap, size_t size, size_t pgsize)
{
size_t i;
size_t count;
for (i = 0, count = 0; i < size; i++) {
count += __builtin_popcount((uint8_t)bitmap[i]);
}
- vfu_log(vfu_ctx, LOG_DEBUG, "dirty pages: get [%p, %p), %zu dirty pages",
+ vfu_log(vfu_ctx, LOG_DEBUG,
+ "dirty pages: get [%p, %p), %zu dirty pages of size %zu",
region->info.iova.iov_base, iov_end(&region->info.iova),
- count);
+ count, pgsize);
}
#endif
+static void
+dirty_page_exchange(uint8_t *outp, uint8_t *bitmap)
+{
+ /*
+ * If no bits are dirty, avoid the atomic exchange. This is obviously
+ * racy, but it's OK: if we miss a dirty bit being set, we'll catch it
+ * the next time around.
+ *
+ * Otherwise, atomically exchange the dirty bits with zero: as we use
+ * atomic or in _dma_mark_dirty(), this cannot lose set bits - we might
+ * miss a bit being set after, but again, we'll catch that next time
+ * around.
+ */
+ if (*bitmap == 0) {
+ *outp = 0;
+ } else {
+ uint8_t zero = 0;
+ __atomic_exchange(bitmap, &zero, outp, __ATOMIC_SEQ_CST);
+ }
+}
+
+static void
+dirty_page_get_same_pgsize(dma_memory_region_t *region, char *bitmap,
+ size_t bitmap_size)
+{
+ for (size_t i = 0; i < bitmap_size; i++) {
+ dirty_page_exchange((uint8_t *)&bitmap[i], &region->dirty_bitmap[i]);
+ }
+}
+
+static void
+dirty_page_get_extend(dma_memory_region_t *region, char *bitmap,
+ size_t server_bitmap_size, size_t server_pgsize,
+ size_t client_bitmap_size, size_t client_pgsize)
+{
+ /*
+ * The index of the bit in the client bitmap that we are currently
+ * considering. By keeping track of this separately to the for loop, we
+ * allow for one server bit to be repeated for multiple client bytes.
+ */
+    size_t client_bit_idx = 0;
+ size_t server_byte_idx;
+ int server_bit_idx;
+ size_t factor = server_pgsize / client_pgsize;
+
+ /*
+ * Iterate through the bytes of the server bitmap.
+ */
+ for (server_byte_idx = 0; server_byte_idx < server_bitmap_size;
+ server_byte_idx++) {
+
+ if (client_bit_idx / CHAR_BIT >= client_bitmap_size) {
+ break;
+ }
+
+ uint8_t out = 0;
+
+ dirty_page_exchange(&out, &region->dirty_bitmap[server_byte_idx]);
+
+ /*
+ * Iterate through the bits of the server byte, repeating bits to reach
+ * the desired page size.
+ */
+ for (server_bit_idx = 0; server_bit_idx < CHAR_BIT; server_bit_idx++) {
+ uint8_t server_bit = (out >> server_bit_idx) & 1;
+
+ /*
+             * Repeat `factor` times the bit at index `server_bit_idx` of
+             * `out`.
+ *
+ * OR the same bit from the server bitmap (`server_bit`) with
+ * `factor` bits in the client bitmap, from `client_bit_idx` to
+ * `end_client_bit_idx`.
+ */
+ for (size_t end_client_bit_idx = client_bit_idx + factor;
+ client_bit_idx < end_client_bit_idx;
+ client_bit_idx++) {
+
+ bitmap[client_bit_idx / CHAR_BIT] |=
+ server_bit << (client_bit_idx % CHAR_BIT);
+ }
+ }
+ }
+}
+
+static void
+dirty_page_get_combine(dma_memory_region_t *region, char *bitmap,
+ size_t server_bitmap_size, size_t server_pgsize,
+ size_t client_bitmap_size, size_t client_pgsize)
+{
+ /*
+ * The index of the bit in the client bitmap that we are currently
+ * considering. By keeping track of this separately to the for loop, we
+ * allow multiple bytes' worth of server bits to be OR'd together to
+ * calculate one client bit.
+ */
+    size_t client_bit_idx = 0;
+ size_t server_byte_idx;
+ int server_bit_idx;
+ size_t factor = client_pgsize / server_pgsize;
+
+ /*
+ * Iterate through the bytes of the server bitmap.
+ */
+ for (server_byte_idx = 0; server_byte_idx < server_bitmap_size;
+ server_byte_idx++) {
+
+ if (client_bit_idx / CHAR_BIT >= client_bitmap_size) {
+ break;
+ }
+
+ uint8_t out = 0;
+
+ dirty_page_exchange(&out, &region->dirty_bitmap[server_byte_idx]);
+
+ /*
+ * Iterate through the bits of the server byte, combining bits to reach
+ * the desired page size.
+ */
+ for (server_bit_idx = 0; server_bit_idx < CHAR_BIT; server_bit_idx++) {
+ uint8_t server_bit = (out >> server_bit_idx) & 1;
+
+ /*
+ * OR `factor` bits of the server bitmap with the same bit at
+ * index `client_bit_idx` in the client bitmap.
+ */
+ bitmap[client_bit_idx / CHAR_BIT] |=
+ server_bit << (client_bit_idx % CHAR_BIT);
+
+ /*
+ * Only move onto the next bit in the client bitmap once we've
+ * OR'd `factor` bits.
+ */
+ if (((server_byte_idx * CHAR_BIT) + server_bit_idx) % factor
+ == factor - 1) {
+ client_bit_idx++;
+
+ if (client_bit_idx / CHAR_BIT >= client_bitmap_size) {
+ return;
+ }
+ }
+ }
+ }
+}
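
To make the extend case concrete, a self-contained sketch (values are
illustrative): with a server page size of 4 KiB and a client page size of
2 KiB, factor is 2 and each server bit is written out twice, so one dirty
4 KiB page reports both of its 2 KiB halves dirty.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint8_t server = 0x02;   /* only server page 1 is dirty */
        uint16_t client = 0;

        for (int bit = 0; bit < 8; bit++) {
            uint16_t b = (server >> bit) & 1;
            client |= (uint16_t)(b << (2 * bit));
            client |= (uint16_t)(b << (2 * bit + 1));
        }
        /* Client pages 2 and 3 are dirty: 0x02 -> 0x000c. */
        assert(client == 0x000c);
        return 0;
    }
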
+
int
dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
- uint64_t len, size_t pgsize, size_t size,
+ uint64_t len, size_t client_pgsize, size_t size,
char *bitmap)
{
dma_memory_region_t *region;
- ssize_t bitmap_size;
+ ssize_t server_bitmap_size;
+ ssize_t client_bitmap_size;
dma_sg_t sg;
- size_t i;
int ret;
assert(dma != NULL);
@@ -574,24 +706,40 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
return ERROR_INT(ENOTSUP);
}
- if (pgsize != dma->dirty_pgsize) {
- vfu_log(dma->vfu_ctx, LOG_ERR, "bad page size %zu", pgsize);
+ /*
+ * If dirty page logging is not enabled, the requested page size is zero,
+ * or the requested page size is not a power of two, return an error.
+ */
+ if (dma->dirty_pgsize == 0) {
+ vfu_log(dma->vfu_ctx, LOG_ERR, "dirty page logging not enabled");
+ return ERROR_INT(EINVAL);
+ }
+ if (client_pgsize == 0 || (client_pgsize & (client_pgsize - 1)) != 0) {
+ vfu_log(dma->vfu_ctx, LOG_ERR, "bad client page size %zu",
+ client_pgsize);
return ERROR_INT(EINVAL);
}
- bitmap_size = get_bitmap_size(len, pgsize);
- if (bitmap_size < 0) {
- vfu_log(dma->vfu_ctx, LOG_ERR, "failed to get bitmap size");
- return bitmap_size;
+ server_bitmap_size = get_bitmap_size(len, dma->dirty_pgsize);
+ if (server_bitmap_size < 0) {
+ vfu_log(dma->vfu_ctx, LOG_ERR, "failed to get server bitmap size");
+ return server_bitmap_size;
+ }
+
+ client_bitmap_size = get_bitmap_size(len, client_pgsize);
+ if (client_bitmap_size < 0) {
+ vfu_log(dma->vfu_ctx, LOG_ERR, "bad client page size %zu",
+ client_pgsize);
+ return client_bitmap_size;
}
/*
* They must be equal because this is how much data the client expects to
* receive.
*/
- if (size != (size_t)bitmap_size) {
- vfu_log(dma->vfu_ctx, LOG_ERR, "bad bitmap size %zu != %zu", size,
- bitmap_size);
+ if (size != (size_t)client_bitmap_size) {
+ vfu_log(dma->vfu_ctx, LOG_ERR, "bad client bitmap size %zu != %zu",
+ size, client_bitmap_size);
return ERROR_INT(EINVAL);
}
@@ -602,31 +750,29 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
return ERROR_INT(EINVAL);
}
- for (i = 0; i < (size_t)bitmap_size; i++) {
- uint8_t val = region->dirty_bitmap[i];
- uint8_t *outp = (uint8_t *)&bitmap[i];
-
+ if (client_pgsize == dma->dirty_pgsize) {
+ dirty_page_get_same_pgsize(region, bitmap, client_bitmap_size);
+ } else if (client_pgsize < dma->dirty_pgsize) {
/*
- * If no bits are dirty, avoid the atomic exchange. This is obviously
- * racy, but it's OK: if we miss a dirty bit being set, we'll catch it
- * the next time around.
- *
- * Otherwise, atomically exchange the dirty bits with zero: as we use
- * atomic or in _dma_mark_dirty(), this cannot lose set bits - we might
- * miss a bit being set after, but again, we'll catch that next time
- * around.
+ * If the requested page size is less than that used for logging by
+ * the server, the bitmap will need to be extended, repeating bits.
*/
- if (val == 0) {
- *outp = 0;
- } else {
- uint8_t zero = 0;
- __atomic_exchange(&region->dirty_bitmap[i], &zero,
- outp, __ATOMIC_SEQ_CST);
- }
+ dirty_page_get_extend(region, bitmap, server_bitmap_size,
+ dma->dirty_pgsize, client_bitmap_size,
+ client_pgsize);
+ } else {
+ /*
+ * If the requested page size is larger than that used for logging by
+ * the server, the bitmap will need to combine bits with OR, losing
+ * accuracy.
+ */
+ dirty_page_get_combine(region, bitmap, server_bitmap_size,
+ dma->dirty_pgsize, client_bitmap_size,
+ client_pgsize);
}
#ifdef DEBUG
- log_dirty_bitmap(dma->vfu_ctx, region, bitmap, size);
+ log_dirty_bitmap(dma->vfu_ctx, region, bitmap, size, client_pgsize);
#endif
return 0;
diff --git a/lib/dma.h b/lib/dma.h
index 9687f49..789904f 100644
--- a/lib/dma.h
+++ b/lib/dma.h
@@ -386,6 +386,7 @@ int
dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
uint64_t len, size_t pgsize, size_t size,
char *bitmap);
+
bool
dma_sg_is_mappable(const dma_controller_t *dma, const dma_sg_t *sg);
diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c
index 271a269..81b0010 100644
--- a/lib/libvfio-user.c
+++ b/lib/libvfio-user.c
@@ -83,21 +83,16 @@ vfu_log(vfu_ctx_t *vfu_ctx, int level, const char *fmt, ...)
}
static size_t
-get_vfio_caps_size(bool is_migr_reg, vfu_reg_info_t *reg)
+get_vfio_caps_size(vfu_reg_info_t *reg)
{
- size_t type_size = 0;
size_t sparse_size = 0;
- if (is_migr_reg) {
- type_size = sizeof(struct vfio_region_info_cap_type);
- }
-
if (reg->nr_mmap_areas != 0) {
sparse_size = sizeof(struct vfio_region_info_cap_sparse_mmap)
+ (reg->nr_mmap_areas * sizeof(struct vfio_region_sparse_mmap_area));
}
- return type_size + sparse_size;
+ return sparse_size;
}
/*
@@ -106,7 +101,7 @@ get_vfio_caps_size(bool is_migr_reg, vfu_reg_info_t *reg)
* points accordingly.
*/
static int
-dev_get_caps(vfu_ctx_t *vfu_ctx, vfu_reg_info_t *vfu_reg, bool is_migr_reg,
+dev_get_caps(vfu_ctx_t *vfu_ctx, vfu_reg_info_t *vfu_reg,
struct vfio_region_info *vfio_reg, int **fds, size_t *nr_fds)
{
struct vfio_info_cap_header *header;
@@ -120,16 +115,6 @@ dev_get_caps(vfu_ctx_t *vfu_ctx, vfu_reg_info_t *vfu_reg, bool is_migr_reg,
header = (struct vfio_info_cap_header*)(vfio_reg + 1);
- if (is_migr_reg) {
- type = (struct vfio_region_info_cap_type *)header;
- type->header.id = VFIO_REGION_INFO_CAP_TYPE;
- type->header.version = 1;
- type->header.next = 0;
- type->type = VFIO_REGION_TYPE_MIGRATION;
- type->subtype = VFIO_REGION_SUBTYPE_MIGRATION;
- vfio_reg->cap_offset = sizeof(struct vfio_region_info);
- }
-
if (vfu_reg->mmap_areas != NULL) {
int i, nr_mmap_areas = vfu_reg->nr_mmap_areas;
if (type != NULL) {
@@ -218,14 +203,6 @@ region_access(vfu_ctx_t *vfu_ctx, size_t region, char *buf,
if (ret == -1) {
goto out;
}
- } else if (region == VFU_PCI_DEV_MIGR_REGION_IDX) {
- if (vfu_ctx->migration == NULL) {
- vfu_log(vfu_ctx, LOG_ERR, "migration not enabled");
- ret = ERROR_INT(EINVAL);
- goto out;
- }
-
- ret = migration_region_access(vfu_ctx, buf, count, offset, is_write);
} else {
vfu_region_access_cb_t *cb = vfu_ctx->reg_info[region].cb;
@@ -293,8 +270,7 @@ is_valid_region_access(vfu_ctx_t *vfu_ctx, size_t size, uint16_t cmd,
return false;
}
- if (unlikely(device_is_stopped_and_copying(vfu_ctx->migration) &&
- index != VFU_PCI_DEV_MIGR_REGION_IDX)) {
+ if (unlikely(device_is_stopped_and_copying(vfu_ctx->migration))) {
vfu_log(vfu_ctx, LOG_ERR,
"cannot access region %zu while device in stop-and-copy state",
index);
@@ -421,8 +397,7 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
vfu_reg = &vfu_ctx->reg_info[in_info->index];
if (vfu_reg->size > 0) {
- caps_size = get_vfio_caps_size(in_info->index == VFU_PCI_DEV_MIGR_REGION_IDX,
- vfu_reg);
+ caps_size = get_vfio_caps_size(vfu_reg);
}
msg->out.iov.iov_len = MIN(sizeof(*out_info) + caps_size, in_info->argsz);
@@ -457,9 +432,8 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
/* Only actually provide the caps if they fit. */
if (in_info->argsz >= out_info->argsz) {
out_info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
- ret = dev_get_caps(vfu_ctx, vfu_reg,
- in_info->index == VFU_PCI_DEV_MIGR_REGION_IDX,
- out_info, &msg->out.fds, &msg->out.nr_fds);
+ ret = dev_get_caps(vfu_ctx, vfu_reg, out_info, &msg->out.fds,
+ &msg->out.nr_fds);
if (ret < 0) {
return ret;
}
@@ -917,133 +891,320 @@ static int
device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t reason)
{
int ret;
-
+
ret = call_reset_cb(vfu_ctx, reason);
if (ret < 0) {
return ret;
}
if (vfu_ctx->migration != NULL) {
- return handle_device_state(vfu_ctx, vfu_ctx->migration,
- VFIO_DEVICE_STATE_V1_RUNNING, false);
+ migr_state_transition(vfu_ctx->migration,
+ VFIO_USER_DEVICE_STATE_RUNNING);
}
return 0;
}
-static int
-handle_dirty_pages_get(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
+static uint32_t
+device_feature_flags_supported(vfu_ctx_t *vfu_ctx, uint32_t feature)
{
- struct vfio_user_dirty_pages *dirty_pages_in;
- struct vfio_user_dirty_pages *dirty_pages_out;
- struct vfio_user_bitmap_range *range_in;
- struct vfio_user_bitmap_range *range_out;
- size_t argsz;
- int ret;
+ if (vfu_ctx->migration == NULL) {
+ /*
+ * All of the current features require migration.
+ */
+ return 0;
+ }
+ switch (feature) {
+ case VFIO_DEVICE_FEATURE_MIGRATION:
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
+ return VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE;
+ case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
+ return VFIO_DEVICE_FEATURE_GET
+ | VFIO_DEVICE_FEATURE_SET
+ | VFIO_DEVICE_FEATURE_PROBE;
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
+ return VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_PROBE;
+ default:
+ return 0;
+ };
+}
- dirty_pages_in = msg->in.iov.iov_base;
+static bool
+is_migration_feature(uint32_t feature)
+{
+ switch (feature) {
+ case VFIO_DEVICE_FEATURE_MIGRATION:
+ case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
+ return true;
+ }
- if (msg->in.iov.iov_len < sizeof(*dirty_pages_in) + sizeof(*range_in) ||
- dirty_pages_in->argsz > SERVER_MAX_DATA_XFER_SIZE ||
- dirty_pages_in->argsz < sizeof(*dirty_pages_out)) {
- vfu_log(vfu_ctx, LOG_ERR, "invalid message size=%zu argsz=%u",
- msg->in.iov.iov_len, dirty_pages_in->argsz);
- return ERROR_INT(EINVAL);
+ return false;
+}
+
+static bool
+is_dma_feature(uint32_t feature)
+{
+ switch (feature) {
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
+ case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
+ return true;
}
- range_in = msg->in.iov.iov_base + sizeof(*dirty_pages_in);
+ return false;
+}
- /*
- * range_in is client-controlled, but we only need to protect against
- * overflow here: we'll take MIN() against a validated value next, and
- * dma_controller_dirty_page_get() will validate the actual ->bitmap.size
- * value later, anyway.
+static int
+handle_migration_device_feature_get(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg,
+ struct vfio_user_device_feature *req)
+{
+ /*
+ * All supported outgoing data is currently the same size as
+ * struct vfio_user_device_feature_migration.
*/
- argsz = satadd_u64(sizeof(*dirty_pages_out) + sizeof(*range_out),
- range_in->bitmap.size);
+ msg->out.iov.iov_len = sizeof(struct vfio_user_device_feature)
+ + sizeof(struct vfio_user_device_feature_migration);
+
+ if (req->argsz < msg->out.iov.iov_len) {
+ iov_free(&msg->out.iov);
+ return ERROR_INT(EINVAL);
+ }
+
+ msg->out.iov.iov_base = calloc(1, msg->out.iov.iov_len);
- msg->out.iov.iov_len = MIN(dirty_pages_in->argsz, argsz);
- msg->out.iov.iov_base = malloc(msg->out.iov.iov_len);
if (msg->out.iov.iov_base == NULL) {
- return -1;
+ return ERROR_INT(ENOMEM);
}
- dirty_pages_out = msg->out.iov.iov_base;
- memcpy(dirty_pages_out, dirty_pages_in, sizeof(*dirty_pages_out));
- dirty_pages_out->argsz = argsz;
- /*
- * If the reply doesn't fit, reply with just the dirty pages header, giving
- * the needed argsz. Typically this shouldn't happen, as the client knows
- * the needed reply size and has already provided the correct bitmap size.
- */
- if (dirty_pages_in->argsz >= argsz) {
- void *bitmap_out = msg->out.iov.iov_base + sizeof(*dirty_pages_out)
- + sizeof(*range_out);
- range_out = msg->out.iov.iov_base + sizeof(*dirty_pages_out);
- memcpy(range_out, range_in, sizeof(*range_out));
- ret = dma_controller_dirty_page_get(vfu_ctx->dma,
- (vfu_dma_addr_t)(uintptr_t)range_in->iova,
- range_in->size,
- range_in->bitmap.pgsize,
- range_in->bitmap.size, bitmap_out);
- if (ret != 0) {
- ret = errno;
- vfu_log(vfu_ctx, LOG_WARNING,
- "failed to get dirty bitmap from DMA controller: %m");
- free(msg->out.iov.iov_base);
- msg->out.iov.iov_base = NULL;
- msg->out.iov.iov_len = 0;
- return ERROR_INT(ret);
+ memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base,
+ sizeof(struct vfio_user_device_feature));
+
+ struct vfio_user_device_feature *res = msg->out.iov.iov_base;
+ res->argsz = msg->out.iov.iov_len;
+
+ switch (req->flags & VFIO_DEVICE_FEATURE_MASK) {
+ case VFIO_DEVICE_FEATURE_MIGRATION: {
+ struct vfio_user_device_feature_migration *mig =
+ (void *)res->data;
+            // FIXME are these always supported? Can we consider a feature
+            // "supported" if said support is just an empty callback?
+ //
+ // We don't need to return RUNNING or ERROR since they are
+ // always supported.
+ mig->flags = VFIO_MIGRATION_STOP_COPY
+ | VFIO_MIGRATION_PRE_COPY;
+ return 0;
}
- } else {
- vfu_log(vfu_ctx, LOG_ERR,
- "dirty pages: get [%#llx, %#llx): buffer too small (%u < %zu)",
- (ull_t)range_in->iova, (ull_t)range_in->iova + range_in->size,
- dirty_pages_in->argsz, argsz);
+
+ case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: {
+ struct vfio_user_device_feature_mig_state *state =
+ (void *)res->data;
+ state->device_state = migration_get_state(vfu_ctx);
+ return 0;
+ }
+
+ default:
+ vfu_log(vfu_ctx, LOG_ERR, "invalid flags for migration GET (%d)",
+ req->flags);
+ return ERROR_INT(EINVAL);
}
+}
- return 0;
+static int
+handle_migration_device_feature_set(vfu_ctx_t *vfu_ctx, uint32_t feature,
+ struct vfio_user_device_feature *res)
+{
+ assert(feature == VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE);
+
+ struct vfio_user_device_feature_mig_state *state = (void *)res->data;
+
+ return migration_set_state(vfu_ctx, state->device_state);
}
static int
-handle_dirty_pages(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
+handle_dma_device_feature_get(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg,
+ struct vfio_user_device_feature *req)
{
- struct vfio_user_dirty_pages *dirty_pages = msg->in.iov.iov_base;
- int ret;
+ const size_t header_size = sizeof(struct vfio_user_device_feature)
+ + sizeof(struct vfio_user_device_feature_dma_logging_report);
+
+ struct vfio_user_device_feature_dma_logging_report *rep =
+ (void *)req->data;
+
+ dma_controller_t *dma = vfu_ctx->dma;
+
+ if (dma == NULL) {
+ vfu_log(vfu_ctx, LOG_ERR, "DMA not enabled for DMA device feature");
+ return ERROR_INT(EINVAL);
+ }
+
+ ssize_t bitmap_size = get_bitmap_size(rep->length, rep->page_size);
+ if (bitmap_size < 0) {
+ return bitmap_size;
+ }
+
+ msg->out.iov.iov_len = header_size + bitmap_size;
+
+ if (req->argsz < msg->out.iov.iov_len) {
+ iov_free(&msg->out.iov);
+ return ERROR_INT(EINVAL);
+ }
+
+ msg->out.iov.iov_base = calloc(1, msg->out.iov.iov_len);
+
+ if (msg->out.iov.iov_base == NULL) {
+ return ERROR_INT(ENOMEM);
+ }
+
+ memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base, header_size);
+
+ struct vfio_user_device_feature *res = msg->out.iov.iov_base;
+
+ res->argsz = msg->out.iov.iov_len;
+ char *bitmap = (char *)msg->out.iov.iov_base + header_size;
+
+ int ret = dma_controller_dirty_page_get(dma,
+ (vfu_dma_addr_t) rep->iova,
+ rep->length,
+ rep->page_size,
+ bitmap_size,
+ bitmap);
+
+ if (ret < 0) {
+ iov_free(&msg->out.iov);
+ }
+
+ return ret;
+}
+
+static int
+handle_dma_device_feature_set(vfu_ctx_t *vfu_ctx, uint32_t feature,
+ struct vfio_user_device_feature *res)
+{
+ dma_controller_t *dma = vfu_ctx->dma;
+
+ assert(dma != NULL);
+
+ if (feature == VFIO_DEVICE_FEATURE_DMA_LOGGING_START) {
+ struct vfio_user_device_feature_dma_logging_control *ctl =
+ (void *)res->data;
+ return dma_controller_dirty_page_logging_start(dma,
+ ctl->page_size);
+ }
+
+ assert(feature == VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP);
+
+ dma_controller_dirty_page_logging_stop(dma);
+ return 0;
+}
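
For orientation, the client-side sequence these handlers serve looks
roughly as follows (a sketch only: the enclosing vfio_user_device_feature
header and wire framing are elided, and the values are illustrative; the
field names are the ones consumed above).

    /* 1. SET VFIO_DEVICE_FEATURE_DMA_LOGGING_START: begin tracking. */
    struct vfio_user_device_feature_dma_logging_control ctl = {
        .page_size = 4096,
    };

    /* 2. GET VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: fetch a bitmap;
     * the server appends get_bitmap_size(length, page_size) bytes. */
    struct vfio_user_device_feature_dma_logging_report rep = {
        .iova = 0x10000,
        .length = 0x100000,
        .page_size = 4096,
    };

    /* 3. SET VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP: end tracking. */
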
+
+static int
+handle_device_feature(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
+{
assert(vfu_ctx != NULL);
assert(msg != NULL);
- if (msg->in.iov.iov_len < sizeof(*dirty_pages) ||
- dirty_pages->argsz < sizeof(*dirty_pages)) {
- vfu_log(vfu_ctx, LOG_ERR, "invalid message size %zu", msg->in.iov.iov_len);
+ if (msg->in.iov.iov_len < sizeof(struct vfio_user_device_feature)) {
+ vfu_log(vfu_ctx, LOG_ERR, "message too short (%ld)",
+ msg->in.iov.iov_len);
return ERROR_INT(EINVAL);
}
- if (vfu_ctx->migration == NULL) {
- vfu_log(vfu_ctx, LOG_ERR, "migration not configured");
- return ERROR_INT(ENOTSUP);
+ struct vfio_user_device_feature *req = msg->in.iov.iov_base;
+
+ uint32_t operations = req->flags & ~VFIO_DEVICE_FEATURE_MASK;
+ uint32_t feature = req->flags & VFIO_DEVICE_FEATURE_MASK;
+
+ uint32_t supported_ops = device_feature_flags_supported(vfu_ctx, feature);
+
+ if ((req->flags & supported_ops) != operations || supported_ops == 0) {
+ vfu_log(vfu_ctx, LOG_ERR, "unsupported operation(s), flags=%d",
+ req->flags);
+ return ERROR_INT(EINVAL);
}
- switch (dirty_pages->flags) {
- case VFIO_IOMMU_DIRTY_PAGES_FLAG_START:
- ret = dma_controller_dirty_page_logging_start(vfu_ctx->dma,
- migration_get_pgsize(vfu_ctx->migration));
- break;
+ ssize_t ret;
- case VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP:
- dma_controller_dirty_page_logging_stop(vfu_ctx->dma);
- ret = 0;
- break;
+ switch (operations) {
+ case VFIO_DEVICE_FEATURE_GET: {
+ if (is_migration_feature(feature)) {
+ ret = handle_migration_device_feature_get(vfu_ctx, msg, req);
+ } else if (is_dma_feature(feature)) {
+ ret = handle_dma_device_feature_get(vfu_ctx, msg, req);
+ } else {
+ vfu_log(vfu_ctx, LOG_ERR, "unsupported feature %d for GET",
+ feature);
+ return ERROR_INT(EINVAL);
+ }
+ break;
+ }
- case VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP:
- ret = handle_dirty_pages_get(vfu_ctx, msg);
- break;
+ case VFIO_DEVICE_FEATURE_SET: {
+ msg->out.iov.iov_len = msg->in.iov.iov_len;
- default:
- vfu_log(vfu_ctx, LOG_ERR, "bad flags %#x", dirty_pages->flags);
- ret = ERROR_INT(EINVAL);
- break;
+ if (req->argsz < msg->out.iov.iov_len) {
+ vfu_log(vfu_ctx, LOG_ERR, "bad argsz (%d<%ld)", req->argsz,
+ msg->out.iov.iov_len);
+ iov_free(&msg->out.iov);
+ return ERROR_INT(EINVAL);
+ }
+
+ msg->out.iov.iov_base = malloc(msg->out.iov.iov_len);
+
+ if (msg->out.iov.iov_base == NULL) {
+ return ERROR_INT(ENOMEM);
+ }
+
+ memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base,
+ msg->out.iov.iov_len);
+
+ struct vfio_user_device_feature *res = msg->out.iov.iov_base;
+
+ if (is_migration_feature(feature)) {
+ ret = handle_migration_device_feature_set(vfu_ctx, feature, res);
+ } else if (is_dma_feature(feature)) {
+ ret = handle_dma_device_feature_set(vfu_ctx, feature, res);
+ } else {
+ vfu_log(vfu_ctx, LOG_ERR, "unsupported feature %d for SET",
+ feature);
+ return ERROR_INT(EINVAL);
+ }
+ break;
+ }
+
+ default: {
+ /*
+ * PROBE allows GET/SET to also be set (to specify which operations
+ * we want to probe the feature for), so we only check that PROBE
+ * is set, not that it is the only operation flag set.
+ */
+ if (!(operations & VFIO_DEVICE_FEATURE_PROBE)) {
+ vfu_log(vfu_ctx, LOG_ERR, "no operation specified");
+ return ERROR_INT(EINVAL);
+ }
+
+ msg->out.iov.iov_len = msg->in.iov.iov_len;
+
+ if (req->argsz < msg->out.iov.iov_len) {
+ vfu_log(vfu_ctx, LOG_ERR, "bad argsz (%d<%ld)", req->argsz,
+ msg->out.iov.iov_len);
+ iov_free(&msg->out.iov);
+ return ERROR_INT(EINVAL);
+ }
+
+ msg->out.iov.iov_base = malloc(msg->out.iov.iov_len);
+
+ if (msg->out.iov.iov_base == NULL) {
+ return ERROR_INT(ENOMEM);
+ }
+
+ memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base,
+ msg->out.iov.iov_len);
+
+ ret = 0;
+ }
}
return ret;
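
Given the argsz checks above, a well-formed GET from the client side looks
roughly like this (a sketch, values illustrative):

    struct vfio_user_device_feature req = {
        /* argsz must cover the reply: header plus payload. */
        .argsz = sizeof(struct vfio_user_device_feature)
                 + sizeof(struct vfio_user_device_feature_migration),
        .flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION,
    };
    /* A smaller argsz fails with EINVAL rather than truncating. PROBE
     * may additionally carry GET/SET to ask whether those operations
     * are supported for the feature. */
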
@@ -1207,13 +1368,16 @@ handle_request(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
ret = device_reset(vfu_ctx, VFU_RESET_DEVICE);
break;
- case VFIO_USER_DIRTY_PAGES:
- // FIXME: don't allow migration calls if migration == NULL
- if (vfu_ctx->dma != NULL) {
- ret = handle_dirty_pages(vfu_ctx, msg);
- } else {
- ret = 0;
- }
+ case VFIO_USER_DEVICE_FEATURE:
+ ret = handle_device_feature(vfu_ctx, msg);
+ break;
+
+ case VFIO_USER_MIG_DATA_READ:
+ ret = handle_mig_data_read(vfu_ctx, msg);
+ break;
+
+ case VFIO_USER_MIG_DATA_WRITE:
+ ret = handle_mig_data_write(vfu_ctx, msg);
break;
default:
@@ -1317,7 +1481,8 @@ MOCK_DEFINE(cmd_allowed_when_stopped_and_copying)(uint16_t cmd)
{
return cmd == VFIO_USER_REGION_READ ||
cmd == VFIO_USER_REGION_WRITE ||
- cmd == VFIO_USER_DIRTY_PAGES;
+ cmd == VFIO_USER_DEVICE_FEATURE ||
+ cmd == VFIO_USER_MIG_DATA_READ;
}
bool
@@ -1343,14 +1508,14 @@ static bool
access_needs_quiesce(const vfu_ctx_t *vfu_ctx, size_t region_index,
uint64_t offset)
{
- return access_migration_needs_quiesce(vfu_ctx, region_index, offset)
- || access_is_pci_cap_exp(vfu_ctx, region_index, offset);
+ return access_is_pci_cap_exp(vfu_ctx, region_index, offset);
}
static bool
command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg)
{
struct vfio_user_region_access *reg;
+ struct vfio_user_device_feature *feature;
if (vfu_ctx->quiesce == NULL) {
return false;
@@ -1364,22 +1529,11 @@ command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg)
case VFIO_USER_DEVICE_RESET:
return true;
- case VFIO_USER_DIRTY_PAGES: {
- struct vfio_user_dirty_pages *dirty_pages = msg->in.iov.iov_base;
-
- if (msg->in.iov.iov_len < sizeof(*dirty_pages)) {
- return false;
- }
-
- return !(dirty_pages->flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP);
- }
-
case VFIO_USER_REGION_WRITE:
if (msg->in.iov.iov_len < sizeof(*reg)) {
/*
* bad request, it will be eventually failed by
* handle_region_access
- *
*/
return false;
}
@@ -1388,8 +1542,23 @@ command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg)
return true;
}
break;
+
+ case VFIO_USER_DEVICE_FEATURE:
+ if (msg->in.iov.iov_len < sizeof(*feature)) {
+ /*
+             * bad request; it will eventually be failed by
+             * handle_device_feature
+ */
+ return false;
+ }
+ feature = msg->in.iov.iov_base;
+ if (migration_feature_needs_quiesce(feature)) {
+ return true;
+ }
+ break;
}
+
return false;
}
@@ -1842,38 +2011,6 @@ copyin_mmap_areas(vfu_reg_info_t *reg_info,
return 0;
}
-static bool
-ranges_intersect(size_t off1, size_t size1, size_t off2, size_t size2)
-{
- /*
- * For two ranges to intersect, the start of each range must be before the
- * end of the other range.
- * TODO already defined in lib/pci_caps.c, maybe introduce a file for misc
- * utility functions?
- */
- return (off1 < (off2 + size2) && off2 < (off1 + size1));
-}
-
-static bool
-maps_over_migr_regs(struct iovec *iov)
-{
- return ranges_intersect(0, vfu_get_migr_register_area_size(),
- (size_t)iov->iov_base, iov->iov_len);
-}
-
-static bool
-validate_sparse_mmaps_for_migr_reg(vfu_reg_info_t *reg)
-{
- int i;
-
- for (i = 0; i < reg->nr_mmap_areas; i++) {
- if (maps_over_migr_regs(&reg->mmap_areas[i])) {
- return false;
- }
- }
- return true;
-}
-
EXPORT int
vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size,
vfu_region_access_cb_t *cb, int flags,
@@ -1919,12 +2056,6 @@ vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size,
return ERROR_INT(EINVAL);
}
- if (region_idx == VFU_PCI_DEV_MIGR_REGION_IDX &&
- size < vfu_get_migr_register_area_size()) {
- vfu_log(vfu_ctx, LOG_ERR, "invalid migration region size %zu", size);
- return ERROR_INT(EINVAL);
- }
-
for (i = 0; i < nr_mmap_areas; i++) {
struct iovec *iov = &mmap_areas[i];
if ((size_t)iov_end(iov) > size) {
@@ -1956,15 +2087,6 @@ vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size,
}
}
- if (region_idx == VFU_PCI_DEV_MIGR_REGION_IDX) {
- if (!validate_sparse_mmaps_for_migr_reg(reg)) {
- vfu_log(vfu_ctx, LOG_ERR,
- "migration registers cannot be memory mapped");
- errno = EINVAL;
- goto err;
- }
- }
-
return 0;
err:
@@ -2044,26 +2166,20 @@ vfu_setup_irq_state_callback(vfu_ctx_t *vfu_ctx, enum vfu_dev_irq_type type,
EXPORT int
vfu_setup_device_migration_callbacks(vfu_ctx_t *vfu_ctx,
- const vfu_migration_callbacks_t *callbacks,
- uint64_t data_offset)
+ const vfu_migration_callbacks_t *callbacks)
{
int ret = 0;
assert(vfu_ctx != NULL);
assert(callbacks != NULL);
- if (vfu_ctx->reg_info[VFU_PCI_DEV_MIGR_REGION_IDX].size == 0) {
- vfu_log(vfu_ctx, LOG_ERR, "no device migration region");
- return ERROR_INT(EINVAL);
- }
-
if (callbacks->version != VFU_MIGR_CALLBACKS_VERS) {
vfu_log(vfu_ctx, LOG_ERR, "unsupported migration callbacks version %d",
callbacks->version);
return ERROR_INT(EINVAL);
}
- vfu_ctx->migration = init_migration(callbacks, data_offset, &ret);
+ vfu_ctx->migration = init_migration(callbacks, &ret);
if (vfu_ctx->migration == NULL) {
vfu_log(vfu_ctx, LOG_ERR, "failed to initialize device migration");
return ERROR_INT(ret);
diff --git a/lib/migration.c b/lib/migration.c
index 794e7b8..02c29c1 100644
--- a/lib/migration.c
+++ b/lib/migration.c
@@ -39,17 +39,100 @@
#include "private.h"
#include "migration_priv.h"
+/*
+ * This defines valid migration state transitions. Each element in the array
+ * corresponds to a FROM state and each bit of the element to a TO state. If the
+ * bit is set, then the transition is allowed.
+ *
+ * The indices of each state are those in the vfio_user_device_mig_state enum.
+ */
+static const char transitions[VFIO_USER_DEVICE_NUM_STATES] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = 0,
+ [VFIO_USER_DEVICE_STATE_STOP] = (1 << VFIO_USER_DEVICE_STATE_RUNNING) |
+ (1 << VFIO_USER_DEVICE_STATE_STOP_COPY) |
+ (1 << VFIO_USER_DEVICE_STATE_RESUMING),
+ [VFIO_USER_DEVICE_STATE_RUNNING] = (1 << VFIO_USER_DEVICE_STATE_STOP) |
+ (1 << VFIO_USER_DEVICE_STATE_PRE_COPY),
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = 1 << VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = 1 << VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = 0,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = (1 << VFIO_USER_DEVICE_STATE_RUNNING) |
+ (1 << VFIO_USER_DEVICE_STATE_STOP_COPY),
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = 0
+};
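
Reading the table, for example: RUNNING may move directly to STOP or
PRE_COPY, but not to STOP_COPY, which is reached via STOP using the
shortest-path table below.

    assert(transitions[VFIO_USER_DEVICE_STATE_RUNNING]
           & (1 << VFIO_USER_DEVICE_STATE_STOP));
    assert(!(transitions[VFIO_USER_DEVICE_STATE_RUNNING]
             & (1 << VFIO_USER_DEVICE_STATE_STOP_COPY)));
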
+
+/*
+ * The spec dictates that, if no direct transition is allowed, and the
+ * transition is not one of the explicitly disallowed ones (i.e. anything to
+ * ERROR, anything from ERROR, and STOP_COPY -> PRE_COPY), we should take the
+ * shortest allowed path.
+ *
+ * This can be indexed as `next_state[current][target] == next`. If next is
+ * ERROR, then the transition is not allowed.
+ */
+static const uint32_t
+next_state[VFIO_USER_DEVICE_NUM_STATES][VFIO_USER_DEVICE_NUM_STATES] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = { 0, 0, 0, 0, 0, 0, 0, 0 },
+ [VFIO_USER_DEVICE_STATE_STOP] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP_COPY,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_RESUMING,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ },
+ [VFIO_USER_DEVICE_STATE_RUNNING] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_PRE_COPY,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ },
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP_COPY,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ },
+ [VFIO_USER_DEVICE_STATE_RESUMING] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_RESUMING,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_STOP,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ },
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = { 0, 0, 0, 0, 0, 0, 0, 0 },
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = {
+ [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP_COPY,
+ [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_RUNNING,
+ [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_PRE_COPY,
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR,
+ },
+ [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = { 0, 0, 0, 0, 0, 0, 0, 0 },
+};
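
A minimal sketch of walking this table (it mirrors the
migration_set_state() loop below): step through next_state until the
target, or ERROR, is reached. RUNNING -> STOP_COPY therefore visits STOP
first.

    uint32_t cur = VFIO_USER_DEVICE_STATE_RUNNING;
    uint32_t target = VFIO_USER_DEVICE_STATE_STOP_COPY;

    while (cur != target) {
        uint32_t next = next_state[cur][target];
        if (next == VFIO_USER_DEVICE_STATE_ERROR) {
            break;  /* no allowed path */
        }
        /* transition the device to `next` here */
        cur = next;
    }
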
+
bool
MOCK_DEFINE(vfio_migr_state_transition_is_valid)(uint32_t from, uint32_t to)
{
- return migr_states[from].state & (1 << to);
-}
-
-EXPORT size_t
-vfu_get_migr_register_area_size(void)
-{
- return ROUND_UP(sizeof(struct vfio_user_migration_info),
- sysconf(_SC_PAGE_SIZE));
+ return from < VFIO_USER_DEVICE_NUM_STATES
+ && to < VFIO_USER_DEVICE_NUM_STATES
+ && (transitions[from] & (1 << to)) != 0;
}
/*
@@ -57,16 +140,10 @@ vfu_get_migr_register_area_size(void)
* in vfu_ctx_t.
*/
struct migration *
-init_migration(const vfu_migration_callbacks_t * callbacks,
- uint64_t data_offset, int *err)
+init_migration(const vfu_migration_callbacks_t *callbacks, int *err)
{
struct migration *migr;
- if (data_offset < vfu_get_migr_register_area_size()) {
- *err = EINVAL;
- return NULL;
- }
-
migr = calloc(1, sizeof(*migr));
if (migr == NULL) {
*err = ENOMEM;
@@ -81,15 +158,13 @@ init_migration(const vfu_migration_callbacks_t * callbacks,
migr->pgsize = sysconf(_SC_PAGESIZE);
/* FIXME this should be done in vfu_ctx_realize */
- migr->info.device_state = VFIO_DEVICE_STATE_V1_RUNNING;
- migr->data_offset = data_offset;
+ migr->state = VFIO_USER_DEVICE_STATE_RUNNING;
migr->callbacks = *callbacks;
if (migr->callbacks.transition == NULL ||
- migr->callbacks.get_pending_bytes == NULL ||
- migr->callbacks.prepare_data == NULL ||
migr->callbacks.read_data == NULL ||
- migr->callbacks.write_data == NULL) {
+ migr->callbacks.write_data == NULL ||
+ migr->callbacks.version != VFU_MIGR_CALLBACKS_VERS) {
free(migr);
*err = EINVAL;
return NULL;
@@ -100,35 +175,29 @@ init_migration(const vfu_migration_callbacks_t * callbacks,
void
MOCK_DEFINE(migr_state_transition)(struct migration *migr,
- enum migr_iter_state state)
+ enum vfio_user_device_mig_state state)
{
assert(migr != NULL);
- /* FIXME validate that state transition */
- migr->iter.state = state;
+ migr->state = state;
}
vfu_migr_state_t
-MOCK_DEFINE(migr_state_vfio_to_vfu)(uint32_t device_state)
+MOCK_DEFINE(migr_state_vfio_to_vfu)(uint32_t state)
{
- switch (device_state) {
- case VFIO_DEVICE_STATE_V1_STOP:
+ switch (state) {
+ case VFIO_USER_DEVICE_STATE_STOP:
return VFU_MIGR_STATE_STOP;
- case VFIO_DEVICE_STATE_V1_RUNNING:
+ case VFIO_USER_DEVICE_STATE_RUNNING:
return VFU_MIGR_STATE_RUNNING;
- case VFIO_DEVICE_STATE_V1_SAVING:
- /*
- * FIXME How should the device operate during the stop-and-copy
- * phase? Should we only allow the migration data to be read from
- * the migration region? E.g. Access to any other region should be
- * failed? This might be a good question to send to LKML.
- */
+ case VFIO_USER_DEVICE_STATE_STOP_COPY:
return VFU_MIGR_STATE_STOP_AND_COPY;
- case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING:
- return VFU_MIGR_STATE_PRE_COPY;
- case VFIO_DEVICE_STATE_V1_RESUMING:
+ case VFIO_USER_DEVICE_STATE_RESUMING:
return VFU_MIGR_STATE_RESUME;
+ case VFIO_USER_DEVICE_STATE_PRE_COPY:
+ return VFU_MIGR_STATE_PRE_COPY;
+ default:
+ return -1;
}
- return -1;
}
/**
@@ -165,8 +234,7 @@ MOCK_DEFINE(migr_trans_to_valid_state)(vfu_ctx_t *vfu_ctx, struct migration *mig
return ret;
}
}
- migr->info.device_state = device_state;
- migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_INITIAL);
+ migr_state_transition(migr, device_state);
return 0;
}
@@ -178,372 +246,176 @@ MOCK_DEFINE(handle_device_state)(vfu_ctx_t *vfu_ctx, struct migration *migr,
uint32_t device_state, bool notify)
{
+ assert(vfu_ctx != NULL);
assert(migr != NULL);
- if (!vfio_migr_state_transition_is_valid(migr->info.device_state,
- device_state)) {
+ if (!vfio_migr_state_transition_is_valid(migr->state, device_state)) {
return ERROR_INT(EINVAL);
}
return migr_trans_to_valid_state(vfu_ctx, migr, device_state, notify);
}
-/**
- * Returns 0 on success, -1 on error setting errno.
- */
-static ssize_t
-handle_pending_bytes(vfu_ctx_t *vfu_ctx, struct migration *migr,
- uint64_t *pending_bytes, bool is_write)
+size_t
+migration_get_state(vfu_ctx_t *vfu_ctx)
{
- assert(migr != NULL);
- assert(pending_bytes != NULL);
+ return vfu_ctx->migration->state;
+}
- if (is_write) {
+ssize_t
+migration_set_state(vfu_ctx_t *vfu_ctx, uint32_t device_state)
+{
+ struct migration *migr = vfu_ctx->migration;
+ uint32_t state;
+ ssize_t ret = 0;
+
+    if (device_state >= VFIO_USER_DEVICE_NUM_STATES) {
return ERROR_INT(EINVAL);
}
+
+ while (migr->state != device_state && ret == 0) {
+ state = next_state[migr->state][device_state];
- if (migr->iter.state == VFIO_USER_MIGR_ITER_STATE_FINISHED) {
- *pending_bytes = 0;
- return 0;
- }
-
- switch (migr->iter.state) {
- case VFIO_USER_MIGR_ITER_STATE_INITIAL:
- case VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED:
- /*
- * FIXME what happens if data haven't been consumed in the previous
- * iteration? Check https://www.spinics.net/lists/kvm/msg228608.html.
- */
- *pending_bytes = migr->iter.pending_bytes = migr->callbacks.get_pending_bytes(vfu_ctx);
-
- if (*pending_bytes == 0) {
- migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_FINISHED);
- } else {
- migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_STARTED);
- }
- break;
- case VFIO_USER_MIGR_ITER_STATE_STARTED:
- /*
- * FIXME We might be wrong returning a cached value, check
- * https://www.spinics.net/lists/kvm/msg228608.html
- *
- */
- *pending_bytes = migr->iter.pending_bytes;
- break;
- default:
+ if (state == VFIO_USER_DEVICE_STATE_ERROR) {
return ERROR_INT(EINVAL);
- }
- return 0;
-}
+ }
-/*
- * FIXME reading or writing migration registers with the wrong device state or
- * out of sequence is undefined, but should not result in EINVAL, it should
- * simply be ignored. However this way it's easier to catch development errors.
- * Make this behavior conditional.
- */
+ ret = handle_device_state(vfu_ctx, migr, state, true);
+    }
+
+ return ret;
+}
-/**
- * Returns 0 on success, -1 on error setting errno.
- */
-static ssize_t
-handle_data_offset_when_saving(vfu_ctx_t *vfu_ctx, struct migration *migr,
- bool is_write)
+ssize_t
+handle_mig_data_read(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
{
- int ret = 0;
-
- assert(migr != NULL);
+ assert(vfu_ctx != NULL);
+ assert(msg != NULL);
- if (is_write) {
- vfu_log(vfu_ctx, LOG_ERR, "data_offset is RO when saving");
+ if (msg->in.iov.iov_len < sizeof(struct vfio_user_mig_data)) {
+ vfu_log(vfu_ctx, LOG_ERR, "message too short (%ld)",
+ msg->in.iov.iov_len);
return ERROR_INT(EINVAL);
}
- switch (migr->iter.state) {
- case VFIO_USER_MIGR_ITER_STATE_STARTED:
- ret = migr->callbacks.prepare_data(vfu_ctx, &migr->iter.offset,
- &migr->iter.size);
- if (ret != 0) {
- return ret;
- }
- /*
- * FIXME must first read data_offset and then data_size. They way we've
- * implemented it now, if data_size is read before data_offset we
- * transition to state VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED without
- * calling callbacks.prepare_data, which is wrong. Maybe we need
- * separate states for data_offset and data_size.
- */
- migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED);
- break;
- case VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED:
- /*
- * data_offset is invariant during a save iteration.
- */
- break;
- default:
- vfu_log(vfu_ctx, LOG_ERR,
- "reading data_offset out of sequence is undefined");
+ struct migration *migr = vfu_ctx->migration;
+ struct vfio_user_mig_data *req = msg->in.iov.iov_base;
+
+ if (vfu_ctx->migration == NULL) {
+ vfu_log(vfu_ctx, LOG_ERR, "migration not enabled");
return ERROR_INT(EINVAL);
}
- return 0;
-}
-
-/**
- * Returns 0 on success, -1 on error setting errno.
- */
-static ssize_t
-handle_data_offset(vfu_ctx_t *vfu_ctx, struct migration *migr,
- uint64_t *offset, bool is_write)
-{
- int ret;
-
- assert(migr != NULL);
- assert(offset != NULL);
-
- switch (migr->info.device_state) {
- case VFIO_DEVICE_STATE_V1_SAVING:
- case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING:
- ret = handle_data_offset_when_saving(vfu_ctx, migr, is_write);
- if (ret == 0 && !is_write) {
- *offset = migr->iter.offset + migr->data_offset;
- }
- return ret;
- case VFIO_DEVICE_STATE_V1_RESUMING:
- if (is_write) {
- /* TODO writing to read-only registers should be simply ignored */
- vfu_log(vfu_ctx, LOG_ERR, "bad write to migration data_offset");
- return ERROR_INT(EINVAL);
- }
- ret = migr->callbacks.prepare_data(vfu_ctx, offset, NULL);
- if (ret != 0) {
- return ret;
- }
- *offset += migr->data_offset;
- return 0;
+ if (migr->state != VFIO_USER_DEVICE_STATE_PRE_COPY
+ && migr->state != VFIO_USER_DEVICE_STATE_STOP_COPY) {
+ vfu_log(vfu_ctx, LOG_ERR, "bad migration state to read data: %d",
+ migr->state);
+ return ERROR_INT(EINVAL);
}
- /* TODO improve error message */
- vfu_log(vfu_ctx, LOG_ERR,
- "bad access to migration data_offset in state %s",
- migr_states[migr->info.device_state].name);
- return ERROR_INT(EINVAL);
-}
-
-/**
- * Returns 0 on success, -1 on failure setting errno.
- */
-static ssize_t
-handle_data_size_when_saving(vfu_ctx_t *vfu_ctx, struct migration *migr,
- bool is_write)
-{
- assert(migr != NULL);
- if (is_write) {
- /* TODO improve error message */
- vfu_log(vfu_ctx, LOG_ERR, "data_size is RO when saving");
+ if (req->size > vfu_ctx->client_max_data_xfer_size) {
+ vfu_log(vfu_ctx, LOG_ERR, "transfer size exceeds limit (%d > %ld)",
+ req->size, vfu_ctx->client_max_data_xfer_size);
return ERROR_INT(EINVAL);
}
- if (migr->iter.state != VFIO_USER_MIGR_ITER_STATE_STARTED &&
- migr->iter.state != VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED) {
- vfu_log(vfu_ctx, LOG_ERR,
- "reading data_size ouf of sequence is undefined");
+ if (req->argsz < sizeof(struct vfio_user_mig_data) + req->size) {
+ vfu_log(vfu_ctx, LOG_ERR, "argsz too small (%d < %ld)",
+ req->argsz, sizeof(struct vfio_user_mig_data) + req->size);
return ERROR_INT(EINVAL);
}
- return 0;
-}
-/**
- * Returns 0 on success, -1 on error setting errno.
- */
-static ssize_t
-handle_data_size_when_resuming(vfu_ctx_t *vfu_ctx, struct migration *migr,
- uint64_t size, bool is_write)
-{
- assert(migr != NULL);
+ msg->out.iov.iov_len = msg->in.iov.iov_len + req->size;
+ msg->out.iov.iov_base = calloc(1, msg->out.iov.iov_len);
- if (is_write) {
- return migr->callbacks.data_written(vfu_ctx, size);
+ if (msg->out.iov.iov_base == NULL) {
+ return ERROR_INT(ENOMEM);
}
- return 0;
-}
-/**
- * Returns 0 on success, -1 on failure setting errno.
- */
-static ssize_t
-handle_data_size(vfu_ctx_t *vfu_ctx, struct migration *migr,
- uint64_t *size, bool is_write)
-{
- int ret;
+ struct vfio_user_mig_data *res = msg->out.iov.iov_base;
- assert(vfu_ctx != NULL);
- assert(size != NULL);
-
- switch (migr->info.device_state){
- case VFIO_DEVICE_STATE_V1_SAVING:
- case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING:
- ret = handle_data_size_when_saving(vfu_ctx, migr, is_write);
- if (ret == 0 && !is_write) {
- *size = migr->iter.size;
- }
+ ssize_t ret = migr->callbacks.read_data(vfu_ctx, &res->data, req->size);
+
+ if (ret < 0) {
+ vfu_log(vfu_ctx, LOG_ERR, "read_data callback failed, errno=%d", errno);
+ iov_free(&msg->out.iov);
return ret;
- case VFIO_DEVICE_STATE_V1_RESUMING:
- return handle_data_size_when_resuming(vfu_ctx, migr, *size, is_write);
}
- /* TODO improve error message */
- vfu_log(vfu_ctx, LOG_ERR, "bad access to data_size");
- return ERROR_INT(EINVAL);
+
+ res->size = ret;
+ res->argsz = sizeof(struct vfio_user_mig_data) + ret;
+
+ return 0;
}
-/**
- * Returns 0 on success, -1 on failure setting errno.
- */
ssize_t
-MOCK_DEFINE(migration_region_access_registers)(vfu_ctx_t *vfu_ctx, char *buf,
- size_t count, loff_t pos,
- bool is_write)
+handle_mig_data_write(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
{
+ assert(vfu_ctx != NULL);
+ assert(msg != NULL);
+
+ if (msg->in.iov.iov_len < sizeof(struct vfio_user_mig_data)) {
+ vfu_log(vfu_ctx, LOG_ERR, "message too short (%ld)",
+ msg->in.iov.iov_len);
+ return ERROR_INT(EINVAL);
+ }
+
struct migration *migr = vfu_ctx->migration;
- int ret;
- uint32_t *device_state, old_device_state;
+ struct vfio_user_mig_data *req = msg->in.iov.iov_base;
- assert(migr != NULL);
+ if (vfu_ctx->migration == NULL) {
+ vfu_log(vfu_ctx, LOG_ERR, "migration not enabled");
+ return ERROR_INT(EINVAL);
+ }
- switch (pos) {
- case offsetof(struct vfio_user_migration_info, device_state):
- if (count != sizeof(migr->info.device_state)) {
- vfu_log(vfu_ctx, LOG_ERR,
- "bad device_state access size %zu", count);
- return ERROR_INT(EINVAL);
- }
- device_state = (uint32_t *)buf;
- if (!is_write) {
- *device_state = migr->info.device_state;
- return 0;
- }
- old_device_state = migr->info.device_state;
- vfu_log(vfu_ctx, LOG_DEBUG,
- "migration: transitioning from state %s to state %s",
- migr_states[old_device_state].name,
- migr_states[*device_state].name);
-
- ret = handle_device_state(vfu_ctx, migr, *device_state, true);
- if (ret == 0) {
- vfu_log(vfu_ctx, LOG_DEBUG,
- "migration: transitioned from state %s to state %s",
- migr_states[old_device_state].name,
- migr_states[*device_state].name);
- } else {
- vfu_log(vfu_ctx, LOG_ERR,
- "migration: failed to transition from state %s to state %s",
- migr_states[old_device_state].name,
- migr_states[*device_state].name);
- }
- break;
- case offsetof(struct vfio_user_migration_info, pending_bytes):
- if (count != sizeof(migr->info.pending_bytes)) {
- vfu_log(vfu_ctx, LOG_ERR,
- "bad pending_bytes access size %zu", count);
- return ERROR_INT(EINVAL);
- }
- ret = handle_pending_bytes(vfu_ctx, migr, (uint64_t *)buf, is_write);
- break;
- case offsetof(struct vfio_user_migration_info, data_offset):
- if (count != sizeof(migr->info.data_offset)) {
- vfu_log(vfu_ctx, LOG_ERR,
- "bad data_offset access size %zu", count);
- return ERROR_INT(EINVAL);
- }
- ret = handle_data_offset(vfu_ctx, migr, (uint64_t *)buf, is_write);
- break;
- case offsetof(struct vfio_user_migration_info, data_size):
- if (count != sizeof(migr->info.data_size)) {
- vfu_log(vfu_ctx, LOG_ERR,
- "bad data_size access size %zu", count);
- return ERROR_INT(EINVAL);
- }
- ret = handle_data_size(vfu_ctx, migr, (uint64_t *)buf, is_write);
- break;
- default:
- vfu_log(vfu_ctx, LOG_ERR,
- "bad migration region register offset %#llx",
- (ull_t)pos);
+ if (migr->state != VFIO_USER_DEVICE_STATE_RESUMING) {
+ vfu_log(vfu_ctx, LOG_ERR, "bad migration state to write data: %d",
+ migr->state);
return ERROR_INT(EINVAL);
}
- return ret;
-}
-ssize_t
-migration_region_access(vfu_ctx_t *vfu_ctx, char *buf, size_t count,
- loff_t pos, bool is_write)
-{
- struct migration *migr = vfu_ctx->migration;
- ssize_t ret;
+ if (req->size > vfu_ctx->client_max_data_xfer_size) {
+ vfu_log(vfu_ctx, LOG_ERR, "transfer size exceeds limit (%d > %ld)",
+ req->size, vfu_ctx->client_max_data_xfer_size);
+ return ERROR_INT(EINVAL);
+ }
- assert(migr != NULL);
- assert(buf != NULL);
+ if (req->argsz < sizeof(struct vfio_user_mig_data) + req->size) {
+ vfu_log(vfu_ctx, LOG_ERR, "argsz too small (%d < %ld)",
+ req->argsz, sizeof(struct vfio_user_mig_data) + req->size);
+ return ERROR_INT(EINVAL);
+ }
- /*
- * FIXME don't call the device callback if the migration state is in not in
- * pre-copy/stop-and-copy/resuming state, since the behavior is undefined
- * in that case.
- */
+ if (msg->in.iov.iov_len < sizeof(struct vfio_user_mig_data) + req->size) {
+        vfu_log(vfu_ctx, LOG_ERR, "short write (%ld < %ld)",
+                msg->in.iov.iov_len,
+                sizeof(struct vfio_user_mig_data) + req->size);
+ return ERROR_INT(EINVAL);
+ }
- if (pos + count <= sizeof(struct vfio_user_migration_info)) {
- ret = migration_region_access_registers(vfu_ctx, buf, count,
- pos, is_write);
- if (ret != 0) {
- return ret;
- }
- } else {
-
- if (pos < (loff_t)migr->data_offset) {
- /*
- * TODO we can simply ignore the access to that part and handle
- * any access to the data region properly.
- */
- vfu_log(vfu_ctx, LOG_WARNING,
- "bad access to dead space %#llx - %#llx in migration region",
- (ull_t)pos,
- (ull_t)(pos + count - 1));
- return ERROR_INT(EINVAL);
- }
+ ssize_t ret = migr->callbacks.write_data(vfu_ctx, &req->data, req->size);
- pos -= migr->data_offset;
- if (is_write) {
- ret = migr->callbacks.write_data(vfu_ctx, buf, count, pos);
- if (ret < 0) {
- return -1;
- }
- } else {
- /*
- * FIXME <linux/vfio.h> says:
- *
- * d. Read data_size bytes of data from (region + data_offset) from the
- * migration region.
- *
- * Does this mean that partial reads are not allowed?
- */
- ret = migr->callbacks.read_data(vfu_ctx, buf, count, pos);
- if (ret < 0) {
- return -1;
- }
- }
+ if (ret < 0) {
+ vfu_log(vfu_ctx, LOG_ERR, "write_data callback failed, errno=%d",
+ errno);
+ return ret;
+ } else if (ret != req->size) {
+ vfu_log(vfu_ctx, LOG_ERR, "migration data partial write of size=%ld",
+ ret);
+ return ERROR_INT(EINVAL);
}
- return count;
+ return 0;
}
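
For a device implementing the callbacks these handlers invoke, a minimal
sketch under stated assumptions: the flat state blob and its bookkeeping
are hypothetical, while the three-argument signatures match the
read_data()/write_data() calls above.

    #include <string.h>
    #include <sys/param.h>    /* MIN */

    static uint8_t dev_state[4096];   /* hypothetical state blob */
    static size_t dev_off;

    static ssize_t
    my_read_data(vfu_ctx_t *vfu_ctx UNUSED, void *buf, uint64_t count)
    {
        size_t n = MIN(count, sizeof(dev_state) - dev_off);
        memcpy(buf, dev_state + dev_off, n);
        dev_off += n;
        return n;       /* 0 signals end of stream */
    }

    static ssize_t
    my_write_data(vfu_ctx_t *vfu_ctx UNUSED, void *buf, uint64_t count)
    {
        if (count > sizeof(dev_state) - dev_off) {
            return -1;  /* would overflow the blob */
        }
        memcpy(dev_state + dev_off, buf, count);
        dev_off += count;
        return count;   /* handle_mig_data_write() rejects partials */
    }
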
bool
MOCK_DEFINE(device_is_stopped_and_copying)(struct migration *migr)
{
- return migr != NULL && migr->info.device_state == VFIO_DEVICE_STATE_V1_SAVING;
+ return migr != NULL && migr->state == VFIO_USER_DEVICE_STATE_STOP_COPY;
}
bool
MOCK_DEFINE(device_is_stopped)(struct migration *migr)
{
- return migr != NULL && migr->info.device_state == VFIO_DEVICE_STATE_V1_STOP;
+ return migr != NULL && migr->state == VFIO_USER_DEVICE_STATE_STOP;
}
size_t
@@ -569,17 +441,11 @@ migration_set_pgsize(struct migration *migr, size_t pgsize)
}
bool
-access_migration_needs_quiesce(const vfu_ctx_t *vfu_ctx, size_t region_index,
- uint64_t offset)
+migration_feature_needs_quiesce(struct vfio_user_device_feature *feature)
{
- /*
- * Writing to the migration state register with an unaligned access won't
- * trigger this check but that's not a problem because
- * migration_region_access_registers will fail the access.
- */
- return region_index == VFU_PCI_DEV_MIGR_REGION_IDX
- && vfu_ctx->migration != NULL
- && offset == offsetof(struct vfio_user_migration_info, device_state);
+    return (feature->flags & VFIO_DEVICE_FEATURE_SET) != 0
+        && (feature->flags & VFIO_DEVICE_FEATURE_MASK)
+            == VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
+        && !(feature->flags & VFIO_DEVICE_FEATURE_PROBE);
}
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/migration.h b/lib/migration.h
index 26fd744..928a7e5 100644
--- a/lib/migration.h
+++ b/lib/migration.h
@@ -45,12 +45,19 @@
#include "private.h"
struct migration *
-init_migration(const vfu_migration_callbacks_t *callbacks,
- uint64_t data_offset, int *err);
+init_migration(const vfu_migration_callbacks_t *callbacks, int *err);
+
+size_t
+migration_get_state(vfu_ctx_t *vfu_ctx);
+
+ssize_t
+migration_set_state(vfu_ctx_t *vfu_ctx, uint32_t device_state);
ssize_t
-migration_region_access(vfu_ctx_t *vfu_ctx, char *buf, size_t count,
- loff_t pos, bool is_write);
+handle_mig_data_read(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg);
+
+ssize_t
+handle_mig_data_write(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg);
bool
migration_available(vfu_ctx_t *vfu_ctx);
@@ -65,6 +72,12 @@ migration_get_pgsize(struct migration *migr);
int
migration_set_pgsize(struct migration *migr, size_t pgsize);
+uint64_t
+migration_get_flags(struct migration *migr);
+
+MOCK_DECLARE(void, migr_state_transition, struct migration *migr,
+ enum vfio_user_device_mig_state state);
+
MOCK_DECLARE(bool, vfio_migr_state_transition_is_valid, uint32_t from,
uint32_t to);
@@ -72,8 +85,7 @@ MOCK_DECLARE(ssize_t, handle_device_state, vfu_ctx_t *vfu_ctx,
struct migration *migr, uint32_t device_state, bool notify);
bool
-access_migration_needs_quiesce(const vfu_ctx_t *vfu_ctx, size_t region_index,
- uint64_t offset);
+migration_feature_needs_quiesce(struct vfio_user_device_feature *feature);
#endif /* LIB_VFIO_USER_MIGRATION_H */
diff --git a/lib/migration_priv.h b/lib/migration_priv.h
index d5643af..83c5f7e 100644
--- a/lib/migration_priv.h
+++ b/lib/migration_priv.h
@@ -33,94 +33,12 @@
#include <linux/vfio.h>
-/*
- * FSM to simplify saving device state.
- */
-enum migr_iter_state {
- VFIO_USER_MIGR_ITER_STATE_INITIAL,
- VFIO_USER_MIGR_ITER_STATE_STARTED,
- VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED,
- VFIO_USER_MIGR_ITER_STATE_FINISHED
-};
-
struct migration {
- /*
- * TODO if the user provides an FD then should mmap it and use the migration
- * registers in the file
- */
- struct vfio_user_migration_info info;
+ enum vfio_user_device_mig_state state;
size_t pgsize;
vfu_migration_callbacks_t callbacks;
- uint64_t data_offset;
-
- /*
- * This is only for the saving state. The resuming state is simpler so we
- * don't need it.
- */
- struct {
- enum migr_iter_state state;
- uint64_t pending_bytes;
- uint64_t offset;
- uint64_t size;
- } iter;
-};
-
-struct migr_state_data {
- uint32_t state;
- const char *name;
-};
-
-#define VFIO_DEVICE_STATE_V1_ERROR (VFIO_DEVICE_STATE_V1_SAVING | VFIO_DEVICE_STATE_V1_RESUMING)
-
-/* valid migration state transitions */
-static const struct migr_state_data migr_states[(VFIO_DEVICE_STATE_MASK + 1)] = {
- [VFIO_DEVICE_STATE_V1_STOP] = {
- .state =
- (1 << VFIO_DEVICE_STATE_V1_STOP) |
- (1 << VFIO_DEVICE_STATE_V1_RUNNING),
- .name = "stopped"
- },
- [VFIO_DEVICE_STATE_V1_RUNNING] = {
- .state =
- (1 << VFIO_DEVICE_STATE_V1_STOP) |
- (1 << VFIO_DEVICE_STATE_V1_RUNNING) |
- (1 << VFIO_DEVICE_STATE_V1_SAVING) |
- (1 << (VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING)) |
- (1 << VFIO_DEVICE_STATE_V1_RESUMING) |
- (1 << VFIO_DEVICE_STATE_V1_ERROR),
- .name = "running"
- },
- [VFIO_DEVICE_STATE_V1_SAVING] = {
- .state =
- (1 << VFIO_DEVICE_STATE_V1_STOP) |
- (1 << VFIO_DEVICE_STATE_V1_RUNNING) |
- (1 << VFIO_DEVICE_STATE_V1_SAVING) |
- (1 << VFIO_DEVICE_STATE_V1_ERROR),
- .name = "stop-and-copy"
- },
- [VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING] = {
- .state =
- (1 << VFIO_DEVICE_STATE_V1_STOP) |
- (1 << VFIO_DEVICE_STATE_V1_SAVING) |
- (1 << VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING) |
- (1 << VFIO_DEVICE_STATE_V1_ERROR),
- .name = "pre-copy"
- },
- [VFIO_DEVICE_STATE_V1_RESUMING] = {
- .state =
- (1 << VFIO_DEVICE_STATE_V1_RUNNING) |
- (1 << VFIO_DEVICE_STATE_V1_RESUMING) |
- (1 << VFIO_DEVICE_STATE_V1_ERROR),
- .name = "resuming"
- }
};
-MOCK_DECLARE(ssize_t, migration_region_access_registers, vfu_ctx_t *vfu_ctx,
- char *buf, size_t count, loff_t pos, bool is_write);
-
-MOCK_DECLARE(void, migr_state_transition, struct migration *migr,
- enum migr_iter_state state);
-
MOCK_DECLARE(vfu_migr_state_t, migr_state_vfio_to_vfu, uint32_t device_state);
MOCK_DECLARE(int, state_trans_notify, vfu_ctx_t *vfu_ctx,
@@ -129,4 +47,4 @@ MOCK_DECLARE(int, state_trans_notify, vfu_ctx_t *vfu_ctx,
#endif
-/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
\ No newline at end of file
diff --git a/lib/private.h b/lib/private.h
index fdd804f..6e0170e 100644
--- a/lib/private.h
+++ b/lib/private.h
@@ -195,20 +195,6 @@ typedef struct ioeventfd {
LIST_ENTRY(ioeventfd) entry;
} ioeventfd_t;
-static inline int
-ERROR_INT(int err)
-{
- errno = err;
- return -1;
-}
-
-static inline void *
-ERROR_PTR(int err)
-{
- errno = err;
- return NULL;
-}
-
int
consume_fd(int *fds, size_t nr_fds, size_t index);
diff --git a/samples/client.c b/samples/client.c
index ed66a30..e8b737f 100644
--- a/samples/client.c
+++ b/samples/client.c
@@ -36,6 +36,7 @@
#include <errno.h>
#include <sys/mman.h>
#include <sys/eventfd.h>
+#include <sys/param.h>
#include <time.h>
#include <err.h>
#include <assert.h>
@@ -63,6 +64,8 @@ static char const *irq_to_str[] = {
[VFU_DEV_REQ_IRQ] = "REQ"
};
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+
struct client_dma_region {
/*
* Our DMA regions are one page in size so we only need one bit to mark them as
@@ -121,12 +124,9 @@ send_version(int sock)
"{"
"\"capabilities\":{"
"\"max_msg_fds\":%u,"
- "\"max_data_xfer_size\":%u,"
- "\"migration\":{"
- "\"pgsize\":%ld"
- "}"
+ "\"max_data_xfer_size\":%u"
"}"
- "}", CLIENT_MAX_FDS, CLIENT_MAX_DATA_XFER_SIZE, sysconf(_SC_PAGESIZE));
+ "}", CLIENT_MAX_FDS, CLIENT_MAX_DATA_XFER_SIZE);
cversion.major = LIB_VFIO_USER_MAJOR;
cversion.minor = LIB_VFIO_USER_MINOR;
@@ -225,14 +225,11 @@ send_device_reset(int sock)
}
}
-/* returns whether a VFIO migration capability is found */
-static bool
+static void
get_region_vfio_caps(struct vfio_info_cap_header *header,
struct vfio_region_info_cap_sparse_mmap **sparse)
{
- struct vfio_region_info_cap_type *type;
unsigned int i;
- bool migr = false;
while (true) {
switch (header->id) {
@@ -247,16 +244,6 @@ get_region_vfio_caps(struct vfio_info_cap_header *header,
(ull_t)(*sparse)->areas[i].size);
}
break;
- case VFIO_REGION_INFO_CAP_TYPE:
- type = (struct vfio_region_info_cap_type*)header;
- if (type->type != VFIO_REGION_TYPE_MIGRATION ||
- type->subtype != VFIO_REGION_SUBTYPE_MIGRATION) {
- errx(EXIT_FAILURE, "bad region type %d/%d", type->type,
- type->subtype);
- }
- migr = true;
- printf("client: migration region\n");
- break;
default:
errx(EXIT_FAILURE, "bad VFIO cap ID %#x", header->id);
}
@@ -265,7 +252,6 @@ get_region_vfio_caps(struct vfio_info_cap_header *header,
}
header = (struct vfio_info_cap_header*)((char*)header + header->next - sizeof(struct vfio_region_info));
}
- return migr;
}
static void
@@ -281,7 +267,7 @@ do_get_device_region_info(int sock, struct vfio_region_info *region_info,
}
static void
-mmap_sparse_areas(int *fds, struct vfio_region_info *region_info,
+mmap_sparse_areas(int fd, struct vfio_region_info *region_info,
struct vfio_region_info_cap_sparse_mmap *sparse)
{
size_t i;
@@ -293,14 +279,14 @@ mmap_sparse_areas(int *fds, struct vfio_region_info *region_info,
char pathname[PATH_MAX];
char buf[PATH_MAX] = "";
- ret = snprintf(pathname, sizeof(pathname), "/proc/self/fd/%d", fds[i]);
+ ret = snprintf(pathname, sizeof(pathname), "/proc/self/fd/%d", fd);
assert(ret != -1 && (size_t)ret < sizeof(pathname));
ret = readlink(pathname, buf, sizeof(buf) - 1);
if (ret == -1) {
- err(EXIT_FAILURE, "failed to resolve file descriptor %d", fds[i]);
+ err(EXIT_FAILURE, "failed to resolve file descriptor %d", fd);
}
addr = mmap(NULL, sparse->areas[i].size, PROT_READ | PROT_WRITE,
- MAP_SHARED, fds[i], region_info->offset +
+ MAP_SHARED, fd, region_info->offset +
sparse->areas[i].offset);
if (addr == MAP_FAILED) {
err(EXIT_FAILURE,
@@ -357,16 +343,15 @@ get_device_region_info(int sock, uint32_t index)
nr_fds);
if (cap_sz) {
struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
- if (get_region_vfio_caps((struct vfio_info_cap_header*)(region_info + 1),
- &sparse)) {
- if (sparse != NULL) {
- assert((index == VFU_PCI_DEV_BAR1_REGION_IDX && nr_fds == 2) ||
- (index == VFU_PCI_DEV_MIGR_REGION_IDX && nr_fds == 1));
- assert(nr_fds == sparse->nr_areas);
- mmap_sparse_areas(fds, region_info, sparse);
- }
+ get_region_vfio_caps((struct vfio_info_cap_header*)(region_info + 1),
+ &sparse);
+
+ if (sparse != NULL) {
+ assert(index == VFU_PCI_DEV_BAR1_REGION_IDX && nr_fds == 1);
+ mmap_sparse_areas(fds[0], region_info, sparse);
+ } else {
+ assert(index != VFU_PCI_DEV_BAR1_REGION_IDX);
}
-
}
free(region_info);
}
@@ -399,7 +384,7 @@ get_device_info(int sock, struct vfio_user_device_info *dev_info)
err(EXIT_FAILURE, "failed to get device info");
}
- if (dev_info->num_regions != 10) {
+ if (dev_info->num_regions != 9) {
errx(EXIT_FAILURE, "bad number of device regions %d",
dev_info->num_regions);
}
@@ -484,7 +469,6 @@ access_region(int sock, int region, bool is_write, uint64_t offset,
.iov_len = data_len
}
};
- static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
struct vfio_user_region_access *recv_data;
size_t nr_send_iovecs, recv_data_len;
int op, ret;
@@ -539,6 +523,123 @@ access_region(int sock, int region, bool is_write, uint64_t offset,
return 0;
}
+static int
+set_migration_state(int sock, uint32_t state)
+{
+ static int msg_id = 0xfab1;
+ struct vfio_user_device_feature req = {
+ .argsz = sizeof(struct vfio_user_device_feature)
+ + sizeof(struct vfio_user_device_feature_mig_state),
+ .flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
+ };
+ struct vfio_user_device_feature_mig_state change_state = {
+ .device_state = state,
+ .data_fd = -1
+ };
+ struct iovec send_iovecs[3] = {
+ [1] = {
+ .iov_base = &req,
+ .iov_len = sizeof(req)
+ },
+ [2] = {
+ .iov_base = &change_state,
+ .iov_len = sizeof(change_state)
+ }
+ };
+ void *response = alloca(sizeof(req) + sizeof(change_state));
+
+ if (response == NULL) {
+ return -1;
+ }
+
+ pthread_mutex_lock(&mutex);
+ int ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_DEVICE_FEATURE,
+ send_iovecs, 3, NULL, 0, NULL,
+ response, sizeof(req) + sizeof(change_state),
+ NULL, 0);
+ pthread_mutex_unlock(&mutex);
+
+ if (ret < 0) {
+ err(EXIT_FAILURE, "failed to set state: %d", ret);
+ }
+
+ if (memcmp(&req, response, sizeof(req)) != 0) {
+ err(EXIT_FAILURE, "invalid response to set_migration_state (header)");
+ }
+
+ if (memcmp(&change_state, response + sizeof(req),
+ sizeof(change_state)) != 0) {
+ err(EXIT_FAILURE, "invalid response to set_migration_state (payload)");
+ }
+
+ return ret;
+}
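For reference, the SET request that set_migration_state() builds is simply a device-feature header followed by the migration-state payload. A minimal sketch of that layout (not part of the patch), using the vfio_user_device_feature structures this commit adds to include/vfio-user.h:

    struct {
        struct vfio_user_device_feature           hdr;
        struct vfio_user_device_feature_mig_state state;
    } __attribute__((packed)) req = {
        .hdr = {
            .argsz = sizeof(req),
            .flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE,
        },
        .state = {
            .device_state = VFIO_USER_DEVICE_STATE_STOP_COPY,
            .data_fd      = -1, /* data moves over the socket, not a new FD */
        },
    };

The server echoes the header and payload back unchanged, which is what the two memcmp() checks above verify.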
+
+static ssize_t
+read_migr_data(int sock, void *buf, size_t len)
+{
+ static int msg_id = 0x6904;
+ struct vfio_user_mig_data req = {
+ .argsz = sizeof(struct vfio_user_mig_data) + len,
+ .size = len
+ };
+ struct iovec send_iovecs[2] = {
+ [1] = {
+ .iov_base = &req,
+ .iov_len = sizeof(req)
+ }
+ };
+ struct vfio_user_mig_data *res = calloc(1, sizeof(req) + len);
+
+ assert(res != NULL);
+
+ pthread_mutex_lock(&mutex);
+ ssize_t ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_MIG_DATA_READ,
+ send_iovecs, 2, NULL, 0, NULL,
+ res, sizeof(req) + len, NULL, 0);
+ pthread_mutex_unlock(&mutex);
+
+ if (ret < 0) {
+ err(EXIT_FAILURE, "failed to read migration data: %ld", ret);
+ }
+
+ memcpy(buf, res->data, res->size);
+
+ ssize_t size = res->size;
+
+ free(res);
+
+ return size;
+}
+
+static ssize_t
+write_migr_data(int sock, void *buf, size_t len)
+{
+ static int msg_id = 0x2023;
+ struct vfio_user_mig_data req = {
+ .argsz = sizeof(struct vfio_user_mig_data) + len,
+ .size = len
+ };
+ struct iovec send_iovecs[3] = {
+ [1] = {
+ .iov_base = &req,
+ .iov_len = sizeof(req)
+ },
+ [2] = {
+ .iov_base = buf,
+ .iov_len = len
+ }
+ };
+
+ pthread_mutex_lock(&mutex);
+ ssize_t ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_MIG_DATA_WRITE,
+ send_iovecs, 3, NULL, 0, NULL,
+ &req, sizeof(req), NULL, 0);
+ pthread_mutex_unlock(&mutex);
+
+ return ret;
+}
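Together, these helpers replace the v1 pending_bytes/data_offset/data_size register dance: a reader simply issues VFIO_USER_MIG_DATA_READ until the server returns a zero-sized payload. A hedged sketch of the resulting consumer loop (the buffer size is arbitrary):

    char buf[4096];
    ssize_t n;

    /* Drain the device's migration stream; 0 signals end of data. */
    while ((n = read_migr_data(sock, buf, sizeof(buf))) > 0) {
        /* forward buf[0..n) to the destination, e.g. via write_migr_data() */
    }

do_migrate() below is the full version of this loop, collecting each chunk into an iovec.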
+
static void
access_bar0(int sock, time_t *t)
{
@@ -712,34 +813,33 @@ static void
get_dirty_bitmap(int sock, struct client_dma_region *dma_region,
bool expect_dirty)
{
- uint64_t bitmap_size = _get_bitmap_size(dma_region->map.size,
- sysconf(_SC_PAGESIZE));
- struct vfio_user_dirty_pages *dirty_pages;
- struct vfio_user_bitmap_range *range;
+ struct vfio_user_device_feature *res;
+ struct vfio_user_device_feature_dma_logging_report *report;
char *bitmap;
- size_t size;
- void *data;
int ret;
- size = sizeof(*dirty_pages) + sizeof(*range) + bitmap_size;
+ uint64_t bitmap_size = get_bitmap_size(dma_region->map.size,
+ sysconf(_SC_PAGESIZE));
- data = calloc(1, size);
+ size_t size = sizeof(*res) + sizeof(*report) + bitmap_size;
+
+ void *data = calloc(1, size);
assert(data != NULL);
- dirty_pages = data;
- dirty_pages->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
- dirty_pages->argsz = sizeof(*dirty_pages) + sizeof(*range) + bitmap_size;
+ res = data;
+ res->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT
+ | VFIO_DEVICE_FEATURE_GET;
+ res->argsz = size;
- range = data + sizeof(*dirty_pages);
- range->iova = dma_region->map.addr;
- range->size = dma_region->map.size;
- range->bitmap.size = bitmap_size;
- range->bitmap.pgsize = sysconf(_SC_PAGESIZE);
+ report = (struct vfio_user_device_feature_dma_logging_report *)(res + 1);
+ report->iova = dma_region->map.addr;
+ report->length = dma_region->map.size;
+ report->page_size = sysconf(_SC_PAGESIZE);
- bitmap = data + sizeof(*dirty_pages) + sizeof(*range);
+ bitmap = data + sizeof(*res) + sizeof(*report);
- ret = tran_sock_msg(sock, 0x99, VFIO_USER_DIRTY_PAGES,
- data, sizeof(*dirty_pages) + sizeof(*range),
+ ret = tran_sock_msg(sock, 0x99, VFIO_USER_DEVICE_FEATURE,
+ data, sizeof(*res) + sizeof(*report),
NULL, data, size);
if (ret != 0) {
err(EXIT_FAILURE, "failed to get dirty page bitmap");
@@ -749,14 +849,14 @@ get_dirty_bitmap(int sock, struct client_dma_region *dma_region,
char dirtied_by_client = (dma_region->flags & CLIENT_DIRTY_DMA_REGION) != 0;
char dirtied = dirtied_by_server | dirtied_by_client;
- printf("client: %s: %#llx-%#llx\t%#x\n", __func__,
- (ull_t)range->iova,
- (ull_t)(range->iova + range->size - 1), dirtied);
-
if (expect_dirty) {
assert(dirtied);
}
+ printf("client: %s: %#llx-%#llx\t%#x\n", __func__,
+ (ull_t)report->iova,
+ (ull_t)(report->iova + report->length - 1), dirtied);
+
free(data);
}
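The returned bitmap uses one bit per page_size unit of the queried range, with bit i covering iova + i * page_size; within each byte, bits are assigned least-significant-first, as in kernel VFIO dirty tracking. A small illustrative helper (an assumption of this sketch, not part of the patch) for testing a single page:

    static bool
    page_is_dirty(const uint8_t *bitmap, size_t i)
    {
        return (bitmap[i / 8] >> (i % 8)) & 1;
    }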
@@ -782,64 +882,32 @@ usage(char *argv0)
* @returns the number of iterations performed
*/
static size_t
-do_migrate(int sock, size_t nr_iters, struct iovec *migr_iter)
+do_migrate(int sock, size_t nr_iters, size_t max_iter_size,
+ struct iovec *migr_iter)
{
- int ret;
- uint64_t pending_bytes, data_offset, data_size;
+ ssize_t ret;
size_t i = 0;
- assert(nr_iters > 0);
-
- /* XXX read pending_bytes */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- offsetof(struct vfio_user_migration_info, pending_bytes),
- &pending_bytes, sizeof(pending_bytes));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to read pending_bytes");
- }
-
- for (i = 0; i < nr_iters && pending_bytes > 0; i++) {
-
- /* XXX read data_offset and data_size */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- offsetof(struct vfio_user_migration_info, data_offset),
- &data_offset, sizeof(data_offset));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to read data_offset");
- }
+ for (i = 0; i < nr_iters; i++) {
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- offsetof(struct vfio_user_migration_info, data_size),
- &data_size, sizeof(data_size));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to read data_size");
- }
+ migr_iter[i].iov_len = max_iter_size;
+ migr_iter[i].iov_base = malloc(migr_iter[i].iov_len);
- migr_iter[i].iov_len = data_size;
- migr_iter[i].iov_base = malloc(data_size);
if (migr_iter[i].iov_base == NULL) {
err(EXIT_FAILURE, "failed to allocate migration buffer");
}
/* XXX read migration data */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- data_offset,
- (char *)migr_iter[i].iov_base, data_size);
+ ret = read_migr_data(sock, migr_iter[i].iov_base, migr_iter[i].iov_len);
if (ret < 0) {
err(EXIT_FAILURE, "failed to read migration data");
}
- /* FIXME send migration data to the destination client process */
+ migr_iter[i].iov_len = ret;
- /*
- * XXX read pending_bytes again to indicate to the server that the
- * migration data have been consumed.
- */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- offsetof(struct vfio_user_migration_info, pending_bytes),
- &pending_bytes, sizeof(pending_bytes));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to read pending_bytes");
+ // We know we've finished transferring data when we read 0 bytes.
+ if (ret == 0) {
+ break;
}
}
return i;
@@ -883,11 +951,12 @@ fake_guest(void *arg)
static size_t
migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters,
- uint32_t *crcp, size_t bar1_size)
+ uint32_t *crcp, size_t bar1_size, size_t max_iter_size)
{
+ size_t expected_data;
uint32_t device_state;
+ size_t iters;
int ret;
- size_t _nr_iters;
pthread_t thread;
struct fake_guest_data fake_guest_data = {
.sock = sock,
@@ -902,7 +971,9 @@ migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters,
err(EXIT_FAILURE, "failed to create pthread");
}
- *nr_iters = 2;
+ expected_data = bar1_size;
+ *nr_iters = (expected_data + max_iter_size - 1) / max_iter_size;
+ assert(*nr_iters == 12);
*migr_iters = malloc(sizeof(struct iovec) * *nr_iters);
if (*migr_iters == NULL) {
err(EXIT_FAILURE, NULL);
@@ -912,16 +983,15 @@ migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters,
* XXX set device state to pre-copy. This is technically optional but any
* VMM that cares about performance needs this.
*/
- device_state = VFIO_DEVICE_STATE_V1_SAVING | VFIO_DEVICE_STATE_V1_RUNNING;
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, device_state),
- &device_state, sizeof(device_state));
+ device_state = VFIO_USER_DEVICE_STATE_PRE_COPY;
+ ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to write to device state");
}
- _nr_iters = do_migrate(sock, 1, *migr_iters);
- assert(_nr_iters == 1);
+ iters = do_migrate(sock, *nr_iters, max_iter_size, *migr_iters);
+ assert(iters == *nr_iters);
+
printf("client: stopping fake guest thread\n");
fake_guest_data.done = true;
__sync_synchronize();
@@ -933,31 +1003,32 @@ migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters,
printf("client: setting device state to stop-and-copy\n");
- device_state = VFIO_DEVICE_STATE_V1_SAVING;
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, device_state),
- &device_state, sizeof(device_state));
+ device_state = VFIO_USER_DEVICE_STATE_STOP_COPY;
+ ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to write to device state");
}
- _nr_iters += do_migrate(sock, 1, (*migr_iters) + _nr_iters);
- if (_nr_iters != 2) {
- errx(EXIT_FAILURE,
- "expected 2 iterations instead of %zu while in stop-and-copy state",
- _nr_iters);
+ expected_data = bar1_size + sizeof(time_t);
+ *nr_iters = (expected_data + max_iter_size - 1) / max_iter_size;
+ assert(*nr_iters == 13);
+ free(*migr_iters);
+ *migr_iters = malloc(sizeof(struct iovec) * *nr_iters);
+ if (*migr_iters == NULL) {
+ err(EXIT_FAILURE, NULL);
}
+ iters = do_migrate(sock, *nr_iters, max_iter_size, *migr_iters);
+ assert(iters == *nr_iters);
+
/* XXX read device state, migration must have finished now */
- device_state = VFIO_DEVICE_STATE_V1_STOP;
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, device_state),
- &device_state, sizeof(device_state));
+ device_state = VFIO_USER_DEVICE_STATE_STOP;
+ ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to write to device state");
}
- return _nr_iters;
+ return iters;
}
static int
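To summarise the source side: instead of polling pending_bytes, the client now walks the v2 state machine and drains the data stream in each saving state. A sketch of the sequence migrate_from() performs:

    /*
     * RUNNING
     *   -> PRE_COPY   read_migr_data() until 0, guest still running
     *   -> STOP_COPY  read_migr_data() until 0, device stopped
     *   -> STOP       stream complete, device quiesced
     */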
@@ -966,11 +1037,11 @@ migrate_to(char *old_sock_path, int *server_max_fds,
struct iovec *migr_iters, char *path_to_server,
uint32_t src_crc, size_t bar1_size)
{
- int ret, sock;
+ ssize_t ret;
+ int sock;
char *sock_path;
struct stat sb;
- uint32_t device_state = VFIO_DEVICE_STATE_V1_RESUMING;
- uint64_t data_offset, data_len;
+ uint32_t device_state = VFIO_USER_DEVICE_STATE_RESUMING;
size_t i;
uint32_t dst_crc;
char buf[bar1_size];
@@ -1020,57 +1091,26 @@ migrate_to(char *old_sock_path, int *server_max_fds,
negotiate(sock, server_max_fds, server_max_data_xfer_size, pgsize);
- /* XXX set device state to resuming */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, device_state),
- &device_state, sizeof(device_state));
+ device_state = VFIO_USER_DEVICE_STATE_RESUMING;
+ ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to set device state to resuming");
}
for (i = 0; i < nr_iters; i++) {
-
- /* XXX read data offset */
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false,
- offsetof(struct vfio_user_migration_info, data_offset),
- &data_offset, sizeof(data_offset));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to read migration data offset");
- }
-
/* XXX write migration data */
-
- /*
- * TODO write half of migration data via regular write and other half via
- * memopy map.
- */
- printf("client: writing migration device data %#llx-%#llx\n",
- (ull_t)data_offset,
- (ull_t)(data_offset + migr_iters[i].iov_len - 1));
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- data_offset, migr_iters[i].iov_base,
- migr_iters[i].iov_len);
+ ret = write_migr_data(sock, migr_iters[i].iov_base,
+ migr_iters[i].iov_len);
if (ret < 0) {
err(EXIT_FAILURE, "failed to write device migration data");
}
-
- /* XXX write data_size */
- data_len = migr_iters[i].iov_len;
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, data_size),
- &data_len, sizeof(data_len));
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to write migration data size");
- }
}
- /* XXX set device state to running */
- device_state = VFIO_DEVICE_STATE_V1_RUNNING;
- ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true,
- offsetof(struct vfio_user_migration_info, device_state),
- &device_state, sizeof(device_state));
+ /* XXX set device state to stop to finish the transfer */
+ device_state = VFIO_USER_DEVICE_STATE_STOP;
+ ret = set_migration_state(sock, device_state);
if (ret < 0) {
- err(EXIT_FAILURE, "failed to set device state to running");
+ err(EXIT_FAILURE, "failed to set device state to stop");
}
/* validate contents of BAR1 */
@@ -1086,6 +1126,13 @@ migrate_to(char *old_sock_path, int *server_max_fds,
abort();
}
+ /* XXX set device state to running */
+ device_state = VFIO_USER_DEVICE_STATE_RUNNING;
+ ret = set_migration_state(sock, device_state);
+ if (ret < 0) {
+ err(EXIT_FAILURE, "failed to set device state to running");
+ }
+
return sock;
}
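The destination mirrors this: enter RESUMING, replay the saved buffers in order, move to STOP, which is what commits the incoming state, and finally to RUNNING once the contents have been validated. A sketch of the sequence migrate_to() performs:

    /*
     * RESUMING    write_migr_data() for each saved iovec, in order
     *   -> STOP     commits the transferred device state
     *   -> RUNNING  device resumes on the destination
     */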
@@ -1125,7 +1172,6 @@ int main(int argc, char *argv[])
size_t server_max_data_xfer_size;
size_t pgsize;
int nr_dma_regions;
- struct vfio_user_dirty_pages dirty_pages = {0};
int opt;
time_t t;
char *path_to_server = NULL;
@@ -1135,6 +1181,14 @@ int main(int argc, char *argv[])
uint32_t crc;
size_t bar1_size = 0x3000; /* FIXME get this value from region info */
+ struct vfio_user_device_feature *dirty_pages_feature;
+ struct vfio_user_device_feature_dma_logging_control *dirty_pages_control;
+ size_t dirty_pages_size = sizeof(*dirty_pages_feature) +
+ sizeof(*dirty_pages_control);
+ void *dirty_pages = malloc(dirty_pages_size);
+ dirty_pages_feature = dirty_pages;
+ dirty_pages_control = (void *)(dirty_pages_feature + 1);
+
while ((opt = getopt(argc, argv, "h")) != -1) {
switch (opt) {
case 'h':
@@ -1229,11 +1283,16 @@ int main(int argc, char *argv[])
*/
irq_fd = configure_irqs(sock);
- dirty_pages.argsz = sizeof(dirty_pages);
- dirty_pages.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
- ret = tran_sock_msg(sock, 0, VFIO_USER_DIRTY_PAGES,
- &dirty_pages, sizeof(dirty_pages),
- NULL, NULL, 0);
+ /* start dirty pages logging */
+ dirty_pages_feature->argsz = sizeof(*dirty_pages_feature) +
+ sizeof(*dirty_pages_control);
+ dirty_pages_feature->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_START |
+ VFIO_DEVICE_FEATURE_SET;
+ dirty_pages_control->num_ranges = 0;
+ dirty_pages_control->page_size = sysconf(_SC_PAGESIZE);
+
+ ret = tran_sock_msg(sock, 0, VFIO_USER_DEVICE_FEATURE, dirty_pages,
+ dirty_pages_size, NULL, dirty_pages, dirty_pages_size);
if (ret != 0) {
err(EXIT_FAILURE, "failed to start dirty page logging");
}
@@ -1270,11 +1329,16 @@ int main(int argc, char *argv[])
get_dirty_bitmap(sock, &dma_regions[i], i < 2);
}
- dirty_pages.argsz = sizeof(dirty_pages);
- dirty_pages.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
- ret = tran_sock_msg(sock, 0, VFIO_USER_DIRTY_PAGES,
- &dirty_pages, sizeof(dirty_pages),
- NULL, NULL, 0);
+ /* stop logging dirty pages */
+ dirty_pages_feature->argsz = sizeof(*dirty_pages_feature) +
+ sizeof(*dirty_pages_control);
+ dirty_pages_feature->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP |
+ VFIO_DEVICE_FEATURE_SET;
+ dirty_pages_control->num_ranges = 0;
+ dirty_pages_control->page_size = sysconf(_SC_PAGESIZE);
+
+ ret = tran_sock_msg(sock, 0, VFIO_USER_DEVICE_FEATURE, dirty_pages,
+ dirty_pages_size, NULL, dirty_pages, dirty_pages_size);
if (ret != 0) {
err(EXIT_FAILURE, "failed to stop dirty page logging");
}
@@ -1316,7 +1380,8 @@ int main(int argc, char *argv[])
err(EXIT_FAILURE, "failed to write to BAR0");
}
- nr_iters = migrate_from(sock, &nr_iters, &migr_iters, &crc, bar1_size);
+ nr_iters = migrate_from(sock, &nr_iters, &migr_iters, &crc, bar1_size,
+ MIN(server_max_data_xfer_size, CLIENT_MAX_DATA_XFER_SIZE));
/*
* Normally the client would now send the device state to the destination
@@ -1374,6 +1439,7 @@ int main(int argc, char *argv[])
}
free(dma_regions);
+ free(dirty_pages);
return 0;
}
diff --git a/samples/gpio-pci-idio-16.c b/samples/gpio-pci-idio-16.c
index b50f407..6c4e99b 100644
--- a/samples/gpio-pci-idio-16.c
+++ b/samples/gpio-pci-idio-16.c
@@ -77,49 +77,23 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
return 0;
}
-static uint64_t
-migration_get_pending_bytes(UNUSED vfu_ctx_t *vfu_ctx)
+static ssize_t
+migration_read_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf, uint64_t size)
{
+ assert(size == sizeof(pin));
+
if (dirty) {
+ memcpy(buf, &pin, sizeof(pin));
+ dirty = false;
return sizeof(pin);
}
- return 0;
-}
-static int
-migration_prepare_data(UNUSED vfu_ctx_t *vfu_ctx,
- uint64_t *offset, uint64_t *size)
-{
- *offset = 0;
- if (size != NULL) { /* null means resuming */
- *size = sizeof(pin);
- }
return 0;
}
static ssize_t
-migration_read_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf,
- uint64_t size, uint64_t offset)
+migration_write_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf, uint64_t size)
{
- assert(offset == 0);
- assert(size == sizeof(pin));
- memcpy(buf, &pin, sizeof(pin));
- dirty = false;
- return 0;
-}
-
-static int
-migration_data_written(UNUSED vfu_ctx_t *vfu_ctx, uint64_t count)
-{
- assert(count == sizeof(pin));
- return 0;
-}
-
-static ssize_t
-migration_write_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf,
- uint64_t size, uint64_t offset)
-{
- assert(offset == 0);
assert(size == sizeof(pin));
memcpy(&pin, buf, sizeof(pin));
return 0;
@@ -145,16 +119,10 @@ main(int argc, char *argv[])
int opt;
struct sigaction act = { .sa_handler = _sa_handler };
vfu_ctx_t *vfu_ctx;
- size_t migr_regs_size = vfu_get_migr_register_area_size();
- size_t migr_data_size = sysconf(_SC_PAGE_SIZE);
- size_t migr_size = migr_regs_size + migr_data_size;
const vfu_migration_callbacks_t migr_callbacks = {
.version = VFU_MIGR_CALLBACKS_VERS,
.transition = &migration_device_state_transition,
- .get_pending_bytes = &migration_get_pending_bytes,
- .prepare_data = &migration_prepare_data,
.read_data = &migration_read_data,
- .data_written = &migration_data_written,
.write_data = &migration_write_data
};
@@ -214,13 +182,7 @@ main(int argc, char *argv[])
}
if (enable_migr) {
- ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, migr_size,
- NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to setup migration region");
- }
- ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
- migr_regs_size);
+ ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks);
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup device migration");
}
diff --git a/samples/server.c b/samples/server.c
index 565974d..5edf674 100644
--- a/samples/server.c
+++ b/samples/server.c
@@ -60,7 +60,7 @@ struct server_data {
size_t bar1_size;
struct dma_regions regions[NR_DMA_REGIONS];
struct {
- uint64_t pending_bytes;
+ uint64_t bytes_transferred;
vfu_migr_state_t state;
} migration;
};
@@ -130,10 +130,6 @@ bar1_access(vfu_ctx_t *vfu_ctx, char * const buf,
}
if (is_write) {
- if (server_data->migration.state == VFU_MIGR_STATE_PRE_COPY) {
- /* dirty the whole thing */
- server_data->migration.pending_bytes = server_data->bar1_size;
- }
memcpy(server_data->bar1 + offset, buf, count);
} else {
memcpy(buf, server_data->bar1, count);
@@ -322,19 +318,24 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
if (setitimer(ITIMER_REAL, &new, NULL) != 0) {
err(EXIT_FAILURE, "failed to disable timer");
}
- server_data->migration.pending_bytes = server_data->bar1_size + sizeof(time_t); /* FIXME BAR0 region size */
+ server_data->migration.bytes_transferred = 0;
break;
case VFU_MIGR_STATE_PRE_COPY:
- /* TODO must be less than size of data region in migration region */
- server_data->migration.pending_bytes = server_data->bar1_size;
+ server_data->migration.bytes_transferred = 0;
break;
case VFU_MIGR_STATE_STOP:
/* FIXME should gracefully fail */
- assert(server_data->migration.pending_bytes == 0);
+ if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) {
+ assert(server_data->migration.bytes_transferred ==
+ server_data->bar1_size + sizeof(time_t));
+ }
break;
case VFU_MIGR_STATE_RESUME:
+ server_data->migration.bytes_transferred = 0;
break;
case VFU_MIGR_STATE_RUNNING:
+ assert(server_data->migration.bytes_transferred ==
+ server_data->bar1_size + sizeof(time_t));
ret = arm_timer(vfu_ctx, server_data->bar0);
if (ret < 0) {
return ret;
@@ -347,125 +348,100 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
return 0;
}
-static uint64_t
-migration_get_pending_bytes(vfu_ctx_t *vfu_ctx)
-{
- struct server_data *server_data = vfu_get_private(vfu_ctx);
- return server_data->migration.pending_bytes;
-}
-
-static int
-migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size)
-{
- struct server_data *server_data = vfu_get_private(vfu_ctx);
-
- *offset = 0;
- if (size != NULL) {
- *size = server_data->migration.pending_bytes;
- }
- return 0;
-}
-
static ssize_t
-migration_read_data(vfu_ctx_t *vfu_ctx, void *buf,
- uint64_t size, uint64_t offset)
+migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t size)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
- if (server_data->migration.state != VFU_MIGR_STATE_PRE_COPY &&
- server_data->migration.state != VFU_MIGR_STATE_STOP_AND_COPY)
- {
- return size;
- }
-
/*
- * For ease of implementation we expect the client to read all migration
- * data in one go; partial reads are not supported. This is allowed by VFIO
- * however we don't yet support it. Similarly, when resuming, partial
- * writes are supported by VFIO, however we don't in this sample.
- *
* If in pre-copy state we copy BAR1, if in stop-and-copy state we copy
* both BAR1 and BAR0. Since we always copy BAR1 in the stop-and-copy state,
* copying BAR1 in the pre-copy state is pointless. Fixing this requires
* more complex state tracking which exceeds the scope of this sample.
*/
- if (offset != 0 || size != server_data->migration.pending_bytes) {
- errno = EINVAL;
- return -1;
- }
+ uint32_t total_to_read = server_data->bar1_size;
- memcpy(buf, server_data->bar1, server_data->bar1_size);
if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) {
- memcpy(buf + server_data->bar1_size, &server_data->bar0,
- sizeof(server_data->bar0));
+ total_to_read += sizeof(server_data->bar0);
+ }
+
+ if (server_data->migration.bytes_transferred == total_to_read || size == 0) {
+ vfu_log(vfu_ctx, LOG_DEBUG, "no data left to read");
+ return 0;
+ }
+
+ uint32_t read_start = server_data->migration.bytes_transferred;
+ uint32_t read_end = MIN(read_start + size, total_to_read);
+ assert(read_end > read_start);
+
+ uint32_t bytes_read = read_end - read_start;
+
+ uint32_t length_in_bar1 = 0;
+ uint32_t length_in_bar0 = 0;
+
+ /* read bar1, if any */
+ if (read_start < server_data->bar1_size) {
+ length_in_bar1 = MIN(bytes_read, server_data->bar1_size - read_start);
+ memcpy(buf, server_data->bar1 + read_start, length_in_bar1);
+ read_start += length_in_bar1;
+ }
+
+ /* read bar0, if any */
+ if (read_end > server_data->bar1_size) {
+ length_in_bar0 = read_end - read_start;
+ read_start -= server_data->bar1_size;
+        memcpy(buf + length_in_bar1, (char *)&server_data->bar0 + read_start,
+               length_in_bar0);
}
- server_data->migration.pending_bytes = 0;
- return size;
+ server_data->migration.bytes_transferred += bytes_read;
+
+ return bytes_read;
}
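A worked example of the cursor logic above (a sketch assuming bar1_size = 0x3000 and an 8-byte time_t, as on typical LP64 builds):

    /*
     * In STOP_COPY, total_to_read = 0x3000 + 8 = 0x3008. A client reading
     * 0x1000 bytes per call gets 0x1000, 0x1000, 0x1000 (all BAR1), then 8
     * (BAR0), then 0; bytes_transferred advances
     * 0 -> 0x1000 -> 0x2000 -> 0x3000 -> 0x3008.
     */

migration_write_data() below applies the same cursor when resuming.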
static ssize_t
-migration_write_data(vfu_ctx_t *vfu_ctx, void *data,
- uint64_t size, uint64_t offset)
+migration_write_data(vfu_ctx_t *vfu_ctx, void *data, uint64_t size)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
char *buf = data;
- int ret;
assert(server_data != NULL);
assert(data != NULL);
- if (offset != 0 || size < server_data->bar1_size) {
- vfu_log(vfu_ctx, LOG_DEBUG, "XXX bad migration data write %#llx-%#llx",
- (unsigned long long)offset,
- (unsigned long long)offset + size - 1);
- errno = EINVAL;
- return -1;
- }
+ uint32_t total_to_write = server_data->bar1_size + sizeof(server_data->bar0);
- memcpy(server_data->bar1, buf, server_data->bar1_size);
- buf += server_data->bar1_size;
- size -= server_data->bar1_size;
- if (size == 0) {
+ if (server_data->migration.bytes_transferred == total_to_write || size == 0) {
return 0;
}
- if (size != sizeof(server_data->bar0)) {
- errno = EINVAL;
- return -1;
- }
- memcpy(&server_data->bar0, buf, sizeof(server_data->bar0));
- ret = bar0_access(vfu_ctx, buf, sizeof(server_data->bar0), 0, true);
- assert(ret == (int)size); /* FIXME */
- return 0;
-}
+ uint32_t write_start = server_data->migration.bytes_transferred;
+ uint32_t write_end = MIN(write_start + size, total_to_write); // exclusive
+ assert(write_end > write_start);
+ uint32_t bytes_written = write_end - write_start;
-static int
-migration_data_written(UNUSED vfu_ctx_t *vfu_ctx, UNUSED uint64_t count)
-{
- /*
- * We apply migration state directly in the migration_write_data callback,
- * so we don't need to do anything here. We would have to apply migration
- * state in this callback if the migration region was memory mappable, in
- * which case we wouldn't know when the client wrote migration data.
- */
+ uint32_t length_in_bar1 = 0;
+ uint32_t length_in_bar0 = 0;
- return 0;
-}
+ /* write to bar1, if any */
+ if (write_start < server_data->bar1_size) {
+ length_in_bar1 = MIN(bytes_written, server_data->bar1_size - write_start);
+ memcpy(server_data->bar1 + write_start, buf, length_in_bar1);
+ write_start += length_in_bar1;
+ }
-static size_t
-nr_pages(size_t size)
-{
- return (size / sysconf(_SC_PAGE_SIZE) +
- (size % sysconf(_SC_PAGE_SIZE) > 1));
-}
+ /* write to bar0, if any */
+ if (write_end > server_data->bar1_size) {
+ length_in_bar0 = write_end - write_start;
+ write_start -= server_data->bar1_size;
+        memcpy((char *)&server_data->bar0 + write_start, buf + length_in_bar1,
+               length_in_bar0);
+ }
-static size_t
-page_align(size_t size)
-{
- return nr_pages(size) * sysconf(_SC_PAGE_SIZE);
+ server_data->migration.bytes_transferred += bytes_written;
+
+ return bytes_written;
}
int main(int argc, char *argv[])
@@ -476,7 +452,6 @@ int main(int argc, char *argv[])
int opt;
struct sigaction act = {.sa_handler = _sa_handler};
const size_t bar1_size = 0x3000;
- size_t migr_regs_size, migr_data_size, migr_size;
struct server_data server_data = {
.migration = {
.state = VFU_MIGR_STATE_RUNNING
@@ -488,10 +463,7 @@ int main(int argc, char *argv[])
const vfu_migration_callbacks_t migr_callbacks = {
.version = VFU_MIGR_CALLBACKS_VERS,
.transition = &migration_device_state_transition,
- .get_pending_bytes = &migration_get_pending_bytes,
- .prepare_data = &migration_prepare_data,
.read_data = &migration_read_data,
- .data_written = &migration_data_written,
.write_data = &migration_write_data
};
@@ -550,9 +522,6 @@ int main(int argc, char *argv[])
* are mappable. The client can still mmap the 2nd page, we can't prohibit
* this under Linux. If we really want to prohibit it we have to use
* separate files for the same region.
- *
- * We choose to use a single file which contains both BAR1 and the migration
- * registers. They could also be completely different files.
*/
if ((tmpfd = mkstemp(template)) == -1) {
err(EXIT_FAILURE, "failed to create backing file");
@@ -562,16 +531,7 @@ int main(int argc, char *argv[])
server_data.bar1_size = bar1_size;
- /*
- * The migration registers aren't memory mappable, so in order to make the
- * rest of the migration region memory mappable we must effectively reserve
- * an entire page.
- */
- migr_regs_size = vfu_get_migr_register_area_size();
- migr_data_size = page_align(bar1_size + sizeof(time_t));
- migr_size = migr_regs_size + migr_data_size;
-
- if (ftruncate(tmpfd, server_data.bar1_size + migr_size) == -1) {
+ if (ftruncate(tmpfd, server_data.bar1_size) == -1) {
err(EXIT_FAILURE, "failed to truncate backing file");
}
server_data.bar1 = mmap(NULL, server_data.bar1_size, PROT_READ | PROT_WRITE,
@@ -591,29 +551,8 @@ int main(int argc, char *argv[])
err(EXIT_FAILURE, "failed to setup BAR1 region");
}
- /* setup migration */
-
- struct iovec migr_mmap_areas[] = {
- [0] = {
- .iov_base = (void *)migr_regs_size,
- .iov_len = migr_data_size
- },
- };
-
- /*
- * The migration region comes after bar1 in the backing file, so offset is
- * server_data.bar1_size.
- */
- ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, migr_size,
- NULL, VFU_REGION_FLAG_RW, migr_mmap_areas,
- ARRAY_SIZE(migr_mmap_areas), tmpfd,
- server_data.bar1_size);
- if (ret < 0) {
- err(EXIT_FAILURE, "failed to setup migration region");
- }
-
- ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
- migr_regs_size);
+ ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks);
+
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup device migration");
}
diff --git a/test/mocks.c b/test/mocks.c
index 2ae14b4..ce3060f 100644
--- a/test/mocks.c
+++ b/test/mocks.c
@@ -200,23 +200,6 @@ should_exec_command(vfu_ctx_t *vfu_ctx, uint16_t cmd)
}
ssize_t
-migration_region_access_registers(vfu_ctx_t *vfu_ctx, char *buf, size_t count,
- loff_t pos, bool is_write)
-{
- if (!is_patched("migration_region_access_registers")) {
- return __real_migration_region_access_registers(vfu_ctx, buf, count,
- pos, is_write);
- }
- check_expected(vfu_ctx);
- check_expected(buf);
- check_expected(count);
- check_expected(pos);
- check_expected(is_write);
- errno = mock();
- return mock();
-}
-
-ssize_t
handle_device_state(vfu_ctx_t *vfu_ctx, struct migration *migr,
uint32_t device_state, bool notify) {
@@ -232,7 +215,8 @@ handle_device_state(vfu_ctx_t *vfu_ctx, struct migration *migr,
}
void
-migr_state_transition(struct migration *migr, enum migr_iter_state state)
+migr_state_transition(struct migration *migr,
+ enum vfio_user_device_mig_state state)
{
if (!is_patched("migr_state_transition")) {
__real_migr_state_transition(migr, state);
diff --git a/test/py/libvfio_user.py b/test/py/libvfio_user.py
index a701d1b..289f10a 100644
--- a/test/py/libvfio_user.py
+++ b/test/py/libvfio_user.py
@@ -43,7 +43,6 @@ import socket
import struct
import syslog
import copy
-import tempfile
import sys
from resource import getpagesize
from math import log2
@@ -126,12 +125,6 @@ VFIO_IRQ_SET_ACTION_TRIGGER = (1 << 5)
VFIO_DMA_UNMAP_FLAG_ALL = (1 << 1)
-VFIO_DEVICE_STATE_V1_STOP = (0)
-VFIO_DEVICE_STATE_V1_RUNNING = (1 << 0)
-VFIO_DEVICE_STATE_V1_SAVING = (1 << 1)
-VFIO_DEVICE_STATE_V1_RESUMING = (1 << 2)
-VFIO_DEVICE_STATE_MASK = ((1 << 3) - 1)
-
# libvfio-user defines
@@ -178,8 +171,11 @@ VFIO_USER_REGION_WRITE = 10
VFIO_USER_DMA_READ = 11
VFIO_USER_DMA_WRITE = 12
VFIO_USER_DEVICE_RESET = 13
-VFIO_USER_DIRTY_PAGES = 14
-VFIO_USER_MAX = 15
+VFIO_USER_REGION_WRITE_MULTI = 15
+VFIO_USER_DEVICE_FEATURE = 16
+VFIO_USER_MIG_DATA_READ = 17
+VFIO_USER_MIG_DATA_WRITE = 18
+VFIO_USER_MAX = 19
VFIO_USER_F_TYPE = 0xf
VFIO_USER_F_TYPE_COMMAND = 0
@@ -198,8 +194,7 @@ VFU_PCI_DEV_BAR5_REGION_IDX = 5
VFU_PCI_DEV_ROM_REGION_IDX = 6
VFU_PCI_DEV_CFG_REGION_IDX = 7
VFU_PCI_DEV_VGA_REGION_IDX = 8
-VFU_PCI_DEV_MIGR_REGION_IDX = 9
-VFU_PCI_DEV_NUM_REGIONS = 10
+VFU_PCI_DEV_NUM_REGIONS = 9
VFU_REGION_FLAG_READ = 1
VFU_REGION_FLAG_WRITE = 2
@@ -212,14 +207,42 @@ VFIO_USER_F_DMA_REGION_WRITE = (1 << 1)
VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP = (1 << 0)
-VFIO_IOMMU_DIRTY_PAGES_FLAG_START = (1 << 0)
-VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP = (1 << 1)
-VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP = (1 << 2)
+# enum vfio_user_device_mig_state
+VFIO_USER_DEVICE_STATE_ERROR = 0
+VFIO_USER_DEVICE_STATE_STOP = 1
+VFIO_USER_DEVICE_STATE_RUNNING = 2
+VFIO_USER_DEVICE_STATE_STOP_COPY = 3
+VFIO_USER_DEVICE_STATE_RESUMING = 4
+VFIO_USER_DEVICE_STATE_RUNNING_P2P = 5
+VFIO_USER_DEVICE_STATE_PRE_COPY = 6
+VFIO_USER_DEVICE_STATE_PRE_COPY_P2P = 7
+
+VFIO_DEVICE_FEATURE_MASK = 0xffff
+VFIO_DEVICE_FEATURE_GET = (1 << 16)
+VFIO_DEVICE_FEATURE_SET = (1 << 17)
+VFIO_DEVICE_FEATURE_PROBE = (1 << 18)
+
+VFIO_DEVICE_FEATURE_MIGRATION = 1
+VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE = 2
+VFIO_DEVICE_FEATURE_DMA_LOGGING_START = 6
+VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP = 7
+VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT = 8
+
+VFIO_MIGRATION_STOP_COPY = (1 << 0)
+VFIO_MIGRATION_P2P = (1 << 1)
+VFIO_MIGRATION_PRE_COPY = (1 << 2)
VFIO_USER_IO_FD_TYPE_IOEVENTFD = 0
VFIO_USER_IO_FD_TYPE_IOREGIONFD = 1
VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW = 2
+# enum vfu_migr_state_t
+VFU_MIGR_STATE_STOP = 0
+VFU_MIGR_STATE_RUNNING = 1
+VFU_MIGR_STATE_STOP_AND_COPY = 2
+VFU_MIGR_STATE_PRE_COPY = 3
+VFU_MIGR_STATE_RESUME = 4
+
# enum vfu_dev_irq_type
VFU_DEV_INTX_IRQ = 0
@@ -244,7 +267,7 @@ VFU_CAP_FLAG_EXTENDED = (1 << 0)
VFU_CAP_FLAG_CALLBACK = (1 << 1)
VFU_CAP_FLAG_READONLY = (1 << 2)
-VFU_MIGR_CALLBACKS_VERS = 1
+VFU_MIGR_CALLBACKS_VERS = 2
SOCK_PATH = b"/tmp/vfio-user.sock.%d" % os.getpid()
@@ -528,14 +551,6 @@ class vfu_dma_info_t(Structure):
return result
-class vfio_user_dirty_pages(Structure):
- _pack_ = 1
- _fields_ = [
- ("argsz", c.c_uint32),
- ("flags", c.c_uint32)
- ]
-
-
class vfio_user_bitmap(Structure):
_pack_ = 1
_fields_ = [
@@ -554,24 +569,73 @@ class vfio_user_bitmap_range(Structure):
transition_cb_t = c.CFUNCTYPE(c.c_int, c.c_void_p, c.c_int, use_errno=True)
-get_pending_bytes_cb_t = c.CFUNCTYPE(c.c_uint64, c.c_void_p)
-prepare_data_cb_t = c.CFUNCTYPE(c.c_void_p, c.POINTER(c.c_uint64),
- c.POINTER(c.c_uint64))
-read_data_cb_t = c.CFUNCTYPE(c.c_ssize_t, c.c_void_p, c.c_void_p,
- c.c_uint64, c.c_uint64)
-write_data_cb_t = c.CFUNCTYPE(c.c_ssize_t, c.c_void_p, c.c_uint64)
-data_written_cb_t = c.CFUNCTYPE(c.c_int, c.c_void_p, c.c_uint64)
+read_data_cb_t = c.CFUNCTYPE(c.c_ssize_t, c.c_void_p, c.c_void_p, c.c_uint64)
+write_data_cb_t = c.CFUNCTYPE(c.c_ssize_t, c.c_void_p, c.c_void_p, c.c_uint64)
class vfu_migration_callbacks_t(Structure):
_fields_ = [
("version", c.c_int),
("transition", transition_cb_t),
- ("get_pending_bytes", get_pending_bytes_cb_t),
- ("prepare_data", prepare_data_cb_t),
("read_data", read_data_cb_t),
("write_data", write_data_cb_t),
- ("data_written", data_written_cb_t),
+ ]
+
+
+class vfio_user_device_feature(Structure):
+ _pack_ = 1
+ _fields_ = [
+ ("argsz", c.c_uint32),
+ ("flags", c.c_uint32)
+ ]
+
+
+class vfio_user_device_feature_migration(Structure):
+ _pack_ = 1
+ _fields_ = [
+ ("flags", c.c_uint64)
+ ]
+
+
+class vfio_user_device_feature_mig_state(Structure):
+ _pack_ = 1
+ _fields_ = [
+ ("device_state", c.c_uint32),
+ ("data_fd", c.c_uint32),
+ ]
+
+
+class vfio_user_device_feature_dma_logging_control(Structure):
+ _pack_ = 1
+ _fields_ = [
+ ("page_size", c.c_uint64),
+ ("num_ranges", c.c_uint32),
+ ("reserved", c.c_uint32),
+ ]
+
+
+class vfio_user_device_feature_dma_logging_range(Structure):
+ _pack_ = 1
+ _fields_ = [
+ ("iova", c.c_uint64),
+ ("length", c.c_uint64),
+ ]
+
+
+class vfio_user_device_feature_dma_logging_report(Structure):
+ _pack_ = 1
+ _fields_ = [
+ ("iova", c.c_uint64),
+ ("length", c.c_uint64),
+ ("page_size", c.c_uint64)
+ ]
+
+
+class vfio_user_mig_data(Structure):
+ _pack_ = 1
+ _fields_ = [
+ ("argsz", c.c_uint32),
+ ("size", c.c_uint32)
]
@@ -590,17 +654,6 @@ class dma_sg_t(Structure):
hex(self.offset), self.writeable)
-class vfio_user_migration_info(Structure):
- _pack_ = 1
- _fields_ = [
- ("device_state", c.c_uint32),
- ("reserved", c.c_uint32),
- ("pending_bytes", c.c_uint64),
- ("data_offset", c.c_uint64),
- ("data_size", c.c_uint64),
- ]
-
-
#
# Util functions
#
@@ -644,7 +697,7 @@ vfu_dma_unregister_cb_t = c.CFUNCTYPE(None, c.c_void_p,
lib.vfu_setup_device_dma.argtypes = (c.c_void_p, vfu_dma_register_cb_t,
vfu_dma_unregister_cb_t)
lib.vfu_setup_device_migration_callbacks.argtypes = (c.c_void_p,
- c.POINTER(vfu_migration_callbacks_t), c.c_uint64)
+ c.POINTER(vfu_migration_callbacks_t))
lib.dma_sg_size.restype = (c.c_size_t)
lib.vfu_addr_to_sgl.argtypes = (c.c_void_p, c.c_void_p, c.c_size_t,
c.POINTER(dma_sg_t), c.c_size_t, c.c_int)
@@ -1019,18 +1072,6 @@ def prepare_ctx_for_dma(dma_register=__dma_register,
ret = vfu_setup_device_reset_cb(ctx, reset)
assert ret == 0
- f = tempfile.TemporaryFile()
- migr_region_size = 2 << PAGE_SHIFT
- f.truncate(migr_region_size)
-
- mmap_areas = [(PAGE_SIZE, PAGE_SIZE)]
-
- ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX,
- size=migr_region_size,
- flags=VFU_REGION_FLAG_RW, mmap_areas=mmap_areas,
- fd=f.fileno())
- assert ret == 0
-
if migration_callbacks:
ret = vfu_setup_device_migration_callbacks(ctx)
assert ret == 0
@@ -1040,6 +1081,18 @@ def prepare_ctx_for_dma(dma_register=__dma_register,
return ctx
+
+def transition_to_state(ctx, sock, state, expect=0, rsp=True, busy=False):
+ feature = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()) +
+ len(vfio_user_device_feature_mig_state()),
+ flags=VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
+ )
+ payload = vfio_user_device_feature_mig_state(device_state=state)
+ msg(ctx, sock, VFIO_USER_DEVICE_FEATURE, bytes(feature) + bytes(payload),
+ expect=expect, rsp=rsp, busy=busy)
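Tests can then drive the v2 state machine with calls such as transition_to_state(ctx, sock, VFIO_USER_DEVICE_STATE_STOP_COPY); the expect, rsp and busy arguments mirror msg(), so error and quiesce paths can be exercised the same way.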
+
+
#
# Library wrappers
#
@@ -1235,24 +1288,6 @@ def __migr_trans_cb(ctx, state):
return migr_trans_cb(ctx, state)
-def migr_get_pending_bytes_cb(ctx):
- pass
-
-
-@get_pending_bytes_cb_t
-def __migr_get_pending_bytes_cb(ctx):
- return migr_get_pending_bytes_cb(ctx)
-
-
-def migr_prepare_data_cb(ctx, offset, size):
- pass
-
-
-@prepare_data_cb_t
-def __migr_prepare_data_cb(ctx, offset, size):
- return migr_prepare_data_cb(ctx, offset, size)
-
-
def migr_read_data_cb(ctx, buf, count, offset):
pass
@@ -1271,29 +1306,17 @@ def __migr_write_data_cb(ctx, buf, count, offset):
return migr_write_data_cb(ctx, buf, count, offset)
-def migr_data_written_cb(ctx, count):
- pass
-
-
-@data_written_cb_t
-def __migr_data_written_cb(ctx, count):
- return migr_data_written_cb(ctx, count)
-
-
-def vfu_setup_device_migration_callbacks(ctx, cbs=None, offset=PAGE_SIZE):
+def vfu_setup_device_migration_callbacks(ctx, cbs=None):
assert ctx is not None
if not cbs:
cbs = vfu_migration_callbacks_t()
cbs.version = VFU_MIGR_CALLBACKS_VERS
cbs.transition = __migr_trans_cb
- cbs.get_pending_bytes = __migr_get_pending_bytes_cb
- cbs.prepare_data = __migr_prepare_data_cb
cbs.read_data = __migr_read_data_cb
cbs.write_data = __migr_write_data_cb
- cbs.data_written = __migr_data_written_cb
- return lib.vfu_setup_device_migration_callbacks(ctx, cbs, offset)
+ return lib.vfu_setup_device_migration_callbacks(ctx, cbs)
def dma_sg_size():
@@ -1355,4 +1378,30 @@ def fds_are_same(fd1: int, fd2: int) -> bool:
return s1.st_dev == s2.st_dev and s1.st_ino == s2.st_ino
+def get_bitmap_size(size: int, pgsize: int) -> int:
+ """
+ Returns the size, in bytes, of the bitmap that represents the given range
+ with the given page size.
+ """
+
+ nr_pages = (size // pgsize) + (1 if size % pgsize != 0 else 0)
+ return ((nr_pages + 63) & ~63) // 8
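For example, the 0x10-page ranges used by the dirty-page tests give nr_pages = 16, which rounds up to 64 bits, so get_bitmap_size() returns 64 / 8 = 8 bytes, matching the bitmap_size == 8 assertion in test_dirty_pages.py below.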
+
+
+get_errno_loc = libc.__errno_location
+get_errno_loc.restype = c.POINTER(c.c_int)
+
+
+def set_real_errno(errno: int):
+ """
+    ctypes keeps its own copy of errno, which is only written back to the
+    real errno when a foreign function call returns. That write-back doesn't
+    happen inside callbacks, so `c.set_errno` alone doesn't propagate in
+    time; in that case we must also set the real errno manually.
+ """
+
+ c.set_errno(errno) # set internal errno so `c.get_errno` gives right value
+ get_errno_loc()[0] = errno # set real errno
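A callback that needs the library to observe a specific error can therefore call, for instance, set_real_errno(errno.EBUSY) before returning -1, rather than relying on c.set_errno alone.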
+
+
# ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: #
diff --git a/test/py/test_device_get_region_info.py b/test/py/test_device_get_region_info.py
index f847cb4..3b7c32d 100644
--- a/test/py/test_device_get_region_info.py
+++ b/test/py/test_device_get_region_info.py
@@ -78,14 +78,6 @@ def test_device_get_region_info_setup():
mmap_areas=mmap_areas, fd=f.fileno(), offset=0x0)
assert ret == 0
- f = tempfile.TemporaryFile()
- f.truncate(migr_region_size)
-
- ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX,
- size=migr_region_size, flags=VFU_REGION_FLAG_RW,
- mmap_areas=migr_mmap_areas, fd=f.fileno())
- assert ret == 0
-
ret = vfu_realize_ctx(ctx)
assert ret == 0
@@ -207,44 +199,6 @@ def test_device_get_region_info_caps():
client.disconnect(ctx)
-def test_device_get_region_info_migr():
- global client
-
- client = connect_client(ctx)
-
- payload = vfio_region_info(argsz=80, flags=0,
- index=VFU_PCI_DEV_MIGR_REGION_IDX, cap_offset=0,
- size=0, offset=0)
- payload = bytes(payload) + b'\0' * (80 - 32)
-
- result = msg(ctx, client.sock, VFIO_USER_DEVICE_GET_REGION_INFO, payload)
-
- info, result = vfio_region_info.pop_from_buffer(result)
- mcap, result = vfio_region_info_cap_type.pop_from_buffer(result)
- cap, result = vfio_region_info_cap_sparse_mmap.pop_from_buffer(result)
- area, result = vfio_region_sparse_mmap_area.pop_from_buffer(result)
-
- assert info.argsz == 80
- assert info.cap_offset == 32
-
- assert mcap.id == VFIO_REGION_INFO_CAP_TYPE
- assert mcap.version == 1
- assert mcap.next == 48
- assert mcap.type == VFIO_REGION_TYPE_MIGRATION
- assert mcap.subtype == VFIO_REGION_SUBTYPE_MIGRATION
-
- assert cap.id == VFIO_REGION_INFO_CAP_SPARSE_MMAP
- assert cap.version == 1
- assert cap.next == 0
- assert cap.nr_areas == len(migr_mmap_areas) == 1
-
- assert area.offset == migr_mmap_areas[0][0]
- assert area.size == migr_mmap_areas[0][1]
-
- # skip reading the SCM_RIGHTS
- client.disconnect(ctx)
-
-
def test_device_get_region_info_cleanup():
vfu_destroy_ctx(ctx)
diff --git a/test/py/test_device_get_region_info_zero_size.py b/test/py/test_device_get_region_info_zero_size.py
index 146e812..a569191 100644
--- a/test/py/test_device_get_region_info_zero_size.py
+++ b/test/py/test_device_get_region_info_zero_size.py
@@ -52,27 +52,26 @@ def test_device_get_region_info_zero_sized_region():
global client
- for index in [VFU_PCI_DEV_BAR1_REGION_IDX, VFU_PCI_DEV_MIGR_REGION_IDX]:
- payload = vfio_region_info(argsz=argsz, flags=0,
- index=index, cap_offset=0,
- size=0, offset=0)
-
- hdr = vfio_user_header(VFIO_USER_DEVICE_GET_REGION_INFO,
- size=len(payload))
- client.sock.send(hdr + payload)
- vfu_run_ctx(ctx)
- result = get_reply(client.sock)
-
- assert len(result) == argsz
-
- info, _ = vfio_region_info.pop_from_buffer(result)
-
- assert info.argsz == argsz
- assert info.flags == 0
- assert info.index == index
- assert info.cap_offset == 0
- assert info.size == 0
- assert info.offset == 0
+ payload = vfio_region_info(argsz=argsz, flags=0,
+ index=VFU_PCI_DEV_BAR1_REGION_IDX, cap_offset=0,
+ size=0, offset=0)
+
+ hdr = vfio_user_header(VFIO_USER_DEVICE_GET_REGION_INFO,
+ size=len(payload))
+ client.sock.send(hdr + payload)
+ vfu_run_ctx(ctx)
+ result = get_reply(client.sock)
+
+ assert len(result) == argsz
+
+ info, _ = vfio_region_info.pop_from_buffer(result)
+
+ assert info.argsz == argsz
+ assert info.flags == 0
+ assert info.index == VFU_PCI_DEV_BAR1_REGION_IDX
+ assert info.cap_offset == 0
+ assert info.size == 0
+ assert info.offset == 0
vfu_destroy_ctx(ctx)
diff --git a/test/py/test_dirty_pages.py b/test/py/test_dirty_pages.py
index f3e4219..5ff0f84 100644
--- a/test/py/test_dirty_pages.py
+++ b/test/py/test_dirty_pages.py
@@ -34,6 +34,7 @@ import mmap
import tempfile
ctx = None
+client = None
quiesce_errno = 0
@@ -69,16 +70,6 @@ def test_dirty_pages_setup():
ret = vfu_setup_device_dma(ctx, dma_register, dma_unregister)
assert ret == 0
- f = tempfile.TemporaryFile()
- f.truncate(2 << PAGE_SHIFT)
-
- mmap_areas = [(PAGE_SIZE, PAGE_SIZE)]
-
- ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX,
- size=2 << PAGE_SHIFT, flags=VFU_REGION_FLAG_RW,
- mmap_areas=mmap_areas, fd=f.fileno())
- assert ret == 0
-
ret = vfu_realize_ctx(ctx)
assert ret == 0
@@ -100,59 +91,51 @@ def test_dirty_pages_setup():
msg(ctx, client.sock, VFIO_USER_DMA_MAP, payload)
-def test_dirty_pages_short_write():
- payload = struct.pack("I", 8)
-
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.EINVAL)
-
-
-def test_dirty_pages_bad_argsz():
- payload = vfio_user_dirty_pages(argsz=4,
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_START)
-
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.EINVAL)
-
-
-def test_dirty_pages_start_no_migration():
- payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()),
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_START)
-
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.ENOTSUP)
+def test_setup_migration():
+ ret = vfu_setup_device_migration_callbacks(ctx)
+ assert ret == 0
-def test_setup_migr_region():
- ret = vfu_setup_device_migration_callbacks(ctx, offset=PAGE_SIZE)
- assert ret == 0
+def start_logging(addr=None, length=None, page_size=PAGE_SIZE, expect=0):
+ """
+ Start logging dirty writes.
+ If a region and page size are specified, they will be sent to the server to
+ start logging. Otherwise, all regions will be logged and the default page
+ size will be used.
-def test_dirty_pages_start_bad_flags():
- #
- # This is a little cheeky, after vfu_realize_ctx(), but it works at the
- # moment.
- #
- payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()),
- flags=(VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
- VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP))
+ Note: in the current implementation, all regions are logged whether or not
+ you specify a region, as the additional constraint of only logging a
+ certain region is considered an optimisation and is not yet implemented.
+ """
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.EINVAL)
+ if addr is not None:
+ ranges = vfio_user_device_feature_dma_logging_range(
+ iova=addr,
+ length=length
+ )
+ num_ranges = 1
+ else:
+ ranges = bytearray()
+ num_ranges = 0
- payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()),
- flags=(VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
- VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP))
+ feature = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()) +
+ len(vfio_user_device_feature_dma_logging_control()) +
+ len(ranges),
+ flags=VFIO_DEVICE_FEATURE_DMA_LOGGING_START | VFIO_DEVICE_FEATURE_SET)
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.EINVAL)
+ payload = vfio_user_device_feature_dma_logging_control(
+ page_size=page_size,
+ num_ranges=num_ranges,
+ reserved=0)
+ msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE,
+ bytes(feature) + bytes(payload) + bytes(ranges), expect=expect)
-def start_logging():
- payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()),
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_START)
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload)
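On the wire, starting dirty-page logging is a single VFIO_USER_DEVICE_FEATURE SET message: the feature header, then the logging control, then num_ranges range entries. A minimal C sketch of the request start_logging() builds for one tracked range (illustrative values; the structures are the ones this patch adds to include/vfio-user.h):

    struct {
        struct vfio_user_device_feature                      hdr;
        struct vfio_user_device_feature_dma_logging_control  ctl;
        struct vfio_user_device_feature_dma_logging_range    range[1];
    } __attribute__((packed)) req = {
        .hdr = {
            .argsz = sizeof(req),
            .flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START,
        },
        .ctl   = { .page_size = 4096, .num_ranges = 1 },
        .range = { { .iova = 0x10000, .length = 0x10000 } },
    };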
+def test_dirty_pages_start_zero_pgsize():
+ start_logging(page_size=0, expect=errno.EINVAL)
def test_dirty_pages_start():
@@ -161,157 +144,65 @@ def test_dirty_pages_start():
start_logging()
-def test_dirty_pages_get_short_read():
- payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()),
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
-
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.EINVAL)
-
-
-#
-# This should in fact work; update when it does.
-#
-def test_dirty_pages_get_sub_range():
- argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8
- dirty_pages = vfio_user_dirty_pages(argsz=argsz,
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
- bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8)
- br = vfio_user_bitmap_range(iova=0x11 << PAGE_SHIFT, size=PAGE_SIZE,
- bitmap=bitmap)
-
- payload = bytes(dirty_pages) + bytes(br)
-
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.ENOTSUP)
+def test_dirty_pages_start_different_pgsize():
+ """
+ Once we've started logging with page size PAGE_SIZE, any request to start
+ logging at a different page size should be rejected.
+ """
+ start_logging(page_size=PAGE_SIZE >> 1, expect=errno.EINVAL)
+ start_logging(page_size=PAGE_SIZE << 1, expect=errno.EINVAL)
-def test_dirty_pages_get_bad_page_size():
- argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8
- dirty_pages = vfio_user_dirty_pages(argsz=argsz,
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
- bitmap = vfio_user_bitmap(pgsize=2 << PAGE_SHIFT, size=8)
- br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT,
- size=0x10 << PAGE_SHIFT, bitmap=bitmap)
- payload = bytes(dirty_pages) + bytes(br)
+def get_dirty_page_bitmap(addr=0x10 << PAGE_SHIFT, length=0x10 << PAGE_SHIFT,
+ page_size=PAGE_SIZE, expect=0):
+ """
+ Get the dirty page bitmap from the server for the given region and page
+ size as a 64-bit integer. This function only works for bitmaps that fit
+ within a 64-bit integer because that's what it returns.
+ """
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.EINVAL)
+ bitmap_size = get_bitmap_size(length, page_size)
+ assert bitmap_size == 8
-def test_dirty_pages_get_bad_bitmap_size():
- argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8
- dirty_pages = vfio_user_dirty_pages(argsz=argsz,
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
- bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=1)
- br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT,
- size=0x10 << PAGE_SHIFT, bitmap=bitmap)
+ argsz = len(vfio_user_device_feature()) + \
+ len(vfio_user_device_feature_dma_logging_report()) + \
+ bitmap_size
- payload = bytes(dirty_pages) + bytes(br)
+ feature = vfio_user_device_feature(
+ argsz=argsz,
+ flags=VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT | VFIO_DEVICE_FEATURE_GET
+ )
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.EINVAL)
+ report = vfio_user_device_feature_dma_logging_report(
+ iova=addr,
+ length=length,
+ page_size=page_size
+ )
+ payload = bytes(feature) + bytes(report)
-def test_dirty_pages_get_bad_argsz():
- dirty_pages = vfio_user_dirty_pages(argsz=SERVER_MAX_DATA_XFER_SIZE + 8,
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
- bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE,
- size=SERVER_MAX_DATA_XFER_SIZE + 8)
- br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT,
- size=0x10 << PAGE_SHIFT, bitmap=bitmap)
+ result = msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload,
+ expect=expect)
- payload = bytes(dirty_pages) + bytes(br)
-
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.EINVAL)
-
-
-def test_dirty_pages_get_short_reply():
- dirty_pages = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()),
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
- bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8)
- br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT,
- size=0x10 << PAGE_SHIFT, bitmap=bitmap)
-
- payload = bytes(dirty_pages) + bytes(br)
-
- result = msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload)
-
- assert len(result) == len(vfio_user_dirty_pages())
-
- dirty_pages, _ = vfio_user_dirty_pages.pop_from_buffer(result)
-
- argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8
-
- assert dirty_pages.argsz == argsz
- assert dirty_pages.flags == VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP
-
-
-def test_get_dirty_page_bitmap_unmapped():
- argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8
-
- dirty_pages = vfio_user_dirty_pages(argsz=argsz,
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
- bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8)
- br = vfio_user_bitmap_range(iova=0x40 << PAGE_SHIFT,
- size=0x10 << PAGE_SHIFT, bitmap=bitmap)
-
- payload = bytes(dirty_pages) + bytes(br)
-
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload,
- expect=errno.EINVAL)
-
-
-def test_dirty_pages_get_unmodified():
- argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8
-
- dirty_pages = vfio_user_dirty_pages(argsz=argsz,
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
- bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8)
- br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT,
- size=0x10 << PAGE_SHIFT, bitmap=bitmap)
-
- payload = bytes(dirty_pages) + bytes(br)
-
- result = msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload)
+ if expect != 0:
+ return
assert len(result) == argsz
- dirty_pages, result = vfio_user_dirty_pages.pop_from_buffer(result)
-
- assert dirty_pages.argsz == argsz
- assert dirty_pages.flags == VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP
-
- br, result = vfio_user_bitmap_range.pop_from_buffer(result)
-
- assert br.iova == 0x10 << PAGE_SHIFT
- assert br.size == 0x10 << PAGE_SHIFT
+ _, result = vfio_user_device_feature.pop_from_buffer(result)
+ _, result = \
+ vfio_user_device_feature_dma_logging_report.pop_from_buffer(result)
- assert br.bitmap.pgsize == PAGE_SIZE
- assert br.bitmap.size == 8
+ assert len(result) == bitmap_size
+ return struct.unpack("Q", result)[0]
-def get_dirty_page_bitmap():
- argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8
-
- dirty_pages = vfio_user_dirty_pages(argsz=argsz,
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
- bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8)
- br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT,
- size=0x10 << PAGE_SHIFT, bitmap=bitmap)
-
- payload = bytes(dirty_pages) + bytes(br)
-
- result = msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload)
-
- _, result = vfio_user_dirty_pages.pop_from_buffer(result)
- _, result = vfio_user_bitmap_range.pop_from_buffer(result)
-
- assert len(result) == 8
- return struct.unpack("Q", result)[0]
+def test_dirty_pages_get_unmodified():
+ bitmap = get_dirty_page_bitmap()
+ assert bitmap == 0
sg3 = None
@@ -374,6 +265,27 @@ def test_dirty_pages_get_modified():
bitmap = get_dirty_page_bitmap()
assert bitmap == 0b0000001111000001
+ # check dirty bitmap is correctly extended when we give a smaller page size
+ vfu_sgl_put(ctx, sg1, iovec1)
+ vfu_sgl_put(ctx, sg4, iovec4)
+ bitmap = get_dirty_page_bitmap(page_size=PAGE_SIZE >> 1)
+ assert bitmap == 0b00000000000011111111000000000011
+
+ # check dirty bitmap is correctly shortened when we give a larger page size
+ vfu_sgl_put(ctx, sg1, iovec1)
+ vfu_sgl_put(ctx, sg4, iovec4)
+ bitmap = get_dirty_page_bitmap(page_size=PAGE_SIZE << 1)
+ assert bitmap == 0b00011001
+
+ # check dirty bitmap is correctly shortened when we give a page size that
+ # is so large that one bit corresponds to multiple bytes in the raw bitmap
+ vfu_sgl_put(ctx, sg1, iovec1)
+ vfu_sgl_put(ctx, sg4, iovec4)
+ bitmap = get_dirty_page_bitmap(page_size=PAGE_SIZE << 4)
+ assert bitmap == 0b1
+ bitmap = get_dirty_page_bitmap(page_size=PAGE_SIZE << 4)
+ assert bitmap == 0b0
+
# after another two puts, should just be one dirty page
vfu_sgl_put(ctx, sg2, iovec2)
vfu_sgl_put(ctx, sg3, iovec3)
@@ -427,72 +339,76 @@ def test_dirty_pages_get_modified():
assert bitmap == 0b010000000000000000001100
-def test_dirty_pages_stop():
- # FIXME we have a memory leak as we don't free dirty bitmaps when
- # destroying the context.
- payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()),
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP)
-
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload)
+def test_dirty_pages_invalid_arguments():
+    # Address that fails to translate
+ get_dirty_page_bitmap(addr=0xdeadbeef, expect=errno.ENOENT)
+ # Does not exactly match a region (libvfio-user limitation)
+ get_dirty_page_bitmap(addr=(0x10 << PAGE_SHIFT) + 1,
+ length=(0x20 << PAGE_SHIFT) - 1,
+ expect=errno.ENOTSUP)
-def test_dirty_pages_start_with_quiesce():
- global quiesce_errno
+ # Invalid requested bitmap size
+ get_dirty_page_bitmap(page_size=1 << 24, expect=errno.EINVAL)
- quiesce_errno = errno.EBUSY
+ # Region not mapped
+ get_dirty_page_bitmap(addr=0x40 << PAGE_SHIFT, expect=errno.EINVAL)
- payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()),
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_START)
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, rsp=False, busy=True)
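+# Stop DMA logging with VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP. With no
+# arguments the request carries zero ranges; otherwise it names one range.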
+def stop_logging(addr=None, length=None):
+ if addr is not None:
+ ranges = vfio_user_device_feature_dma_logging_range(
+ iova=addr,
+ length=length
+ )
+ else:
+ ranges = []
- ret = vfu_device_quiesced(ctx, 0)
- assert ret == 0
+ feature = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()) +
+ len(vfio_user_device_feature_dma_logging_control()) +
+ len(ranges),
+ flags=VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP | VFIO_DEVICE_FEATURE_SET)
- # now should be able to get the reply
- get_reply(client.sock, expect=0)
+ payload = vfio_user_device_feature_dma_logging_control(
+ page_size=PAGE_SIZE,
+ num_ranges=(1 if addr is not None else 0),
+ reserved=0)
- quiesce_errno = 0
+ msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE,
+ bytes(feature) + bytes(payload) + bytes(ranges))
-def test_dirty_pages_bitmap_with_quiesce():
- global quiesce_errno
-
- quiesce_errno = errno.EBUSY
+def test_dirty_pages_stop():
+ stop_logging()
- ret, sg1 = vfu_addr_to_sgl(ctx, dma_addr=0x10 << PAGE_SHIFT,
- length=PAGE_SIZE)
- assert ret == 1
- iovec1 = iovec_t()
- ret = vfu_sgl_get(ctx, sg1, iovec1)
- assert ret == 0
- vfu_sgl_put(ctx, sg1, iovec1)
- bitmap = get_dirty_page_bitmap()
- assert bitmap == 0b0000000000000001
+def test_dirty_pages_cleanup():
+ client.disconnect(ctx)
+ vfu_destroy_ctx(ctx)
-def test_dirty_pages_stop_with_quiesce():
- global quiesce_errno
+def test_dirty_pages_uninitialised_dma():
+ global ctx, client
- quiesce_errno = errno.EBUSY
+ ctx = vfu_create_ctx(flags=LIBVFIO_USER_FLAG_ATTACH_NB)
+ assert ctx is not None
- payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()),
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP)
+ ret = vfu_pci_init(ctx)
+ assert ret == 0
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, rsp=False, busy=True)
+ vfu_setup_device_quiesce_cb(ctx, quiesce_cb=quiesce_cb)
- ret = vfu_device_quiesced(ctx, 0)
+ ret = vfu_realize_ctx(ctx)
assert ret == 0
- # now should be able to get the reply
- get_reply(client.sock, expect=0)
-
- quiesce_errno = 0
+ client = connect_client(ctx)
+ start_logging(expect=errno.EINVAL)
+ get_dirty_page_bitmap(expect=errno.EINVAL)
-def test_dirty_pages_cleanup():
client.disconnect(ctx)
+
vfu_destroy_ctx(ctx)
# ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab:
diff --git a/test/py/test_dma_unmap.py b/test/py/test_dma_unmap.py
index a1fa94b..b21e072 100644
--- a/test/py/test_dma_unmap.py
+++ b/test/py/test_dma_unmap.py
@@ -113,26 +113,6 @@ def test_dma_unmap_dirty_not_tracking():
expect=errno.EINVAL)
-def test_dma_unmap_dirty_not_mapped():
-
- setup_dma_regions([(PAGE_SIZE, PAGE_SIZE)])
- vfu_setup_device_migration_callbacks(ctx, offset=PAGE_SIZE)
- payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()),
- flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_START)
-
- msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload)
-
- argsz = len(vfio_user_dma_unmap()) + len(vfio_user_bitmap()) + 8
- unmap = vfio_user_dma_unmap(argsz=argsz,
- flags=VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, addr=PAGE_SIZE,
- size=PAGE_SIZE)
- bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8)
- payload = bytes(unmap) + bytes(bitmap) + bytes(8)
-
- msg(ctx, client.sock, VFIO_USER_DMA_UNMAP, payload,
- expect=errno.EINVAL)
-
-
def test_dma_unmap_invalid_flags():
setup_dma_regions()
diff --git a/test/py/test_migration.py b/test/py/test_migration.py
index a6327d8..d423119 100644
--- a/test/py/test_migration.py
+++ b/test/py/test_migration.py
@@ -1,7 +1,8 @@
#
-# Copyright (c) 2021 Nutanix Inc. All rights reserved.
+# Copyright (c) 2023 Nutanix Inc. All rights reserved.
#
# Authors: Thanos Makatos <thanos@nutanix.com>
+# William Henderson <william.henderson@nutanix.com>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
@@ -28,25 +29,143 @@
#
from libvfio_user import *
-import ctypes as c
+from collections import deque
+import ctypes
import errno
-from unittest.mock import patch
ctx = None
client = None
+current_state = None # the current migration state on the server
+path = [] # the server transition path (each transition appends the new state)
-def setup_function(function):
+read_data = None
+write_data = None
+callbacks_errno = 0
+
+
+STATES = {
+ VFIO_USER_DEVICE_STATE_STOP,
+ VFIO_USER_DEVICE_STATE_RUNNING,
+ VFIO_USER_DEVICE_STATE_STOP_COPY,
+ VFIO_USER_DEVICE_STATE_RESUMING,
+ VFIO_USER_DEVICE_STATE_PRE_COPY
+}
+
+
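+# States defined by the protocol that this server never enters; transitions
+# into them are expected to fail.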
+UNREACHABLE_STATES = {
+ VFIO_USER_DEVICE_STATE_ERROR,
+ VFIO_USER_DEVICE_STATE_PRE_COPY_P2P,
+ VFIO_USER_DEVICE_STATE_RUNNING_P2P
+}
+
+
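+# The device callbacks receive VFU_MIGR_STATE_* values, while the wire
+# protocol uses VFIO_USER_DEVICE_STATE_*; this map lets migr_trans_cb record
+# the transition path in protocol terms.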
+VFU_TO_VFIO_MIGR_STATE = {
+ VFU_MIGR_STATE_STOP: VFIO_USER_DEVICE_STATE_STOP,
+ VFU_MIGR_STATE_RUNNING: VFIO_USER_DEVICE_STATE_RUNNING,
+ VFU_MIGR_STATE_STOP_AND_COPY: VFIO_USER_DEVICE_STATE_STOP_COPY,
+ VFU_MIGR_STATE_RESUME: VFIO_USER_DEVICE_STATE_RESUMING,
+ VFU_MIGR_STATE_PRE_COPY: VFIO_USER_DEVICE_STATE_PRE_COPY
+}
+
+
+# Set a very small maximum transfer size for later tests.
+MAX_DATA_XFER_SIZE = 4
+
+
+@transition_cb_t
+def migr_trans_cb(_ctx, state):
+ global current_state, path
+
+ if callbacks_errno != 0:
+ set_real_errno(callbacks_errno)
+ return -1
+
+    assert state in VFU_TO_VFIO_MIGR_STATE
+    state = VFU_TO_VFIO_MIGR_STATE[state]
+
+ current_state = state
+
+ path.append(state)
+
+ return 0
+
+
+@read_data_cb_t
+def migr_read_data_cb(_ctx, buf, count):
+ global read_data
+
+ if callbacks_errno != 0:
+ set_real_errno(callbacks_errno)
+ return -1
+
+ length = min(count, len(read_data))
+ ctypes.memmove(buf, read_data, length)
+ read_data = None
+
+ return length
+
+
+@write_data_cb_t
+def migr_write_data_cb(_ctx, buf, count):
+ global write_data
+
+ if callbacks_errno != 0:
+ set_real_errno(callbacks_errno)
+ return -1
+
+    write_data = ctypes.string_at(buf, count)
+
+ return count
+
+
+def setup_fail_callbacks(err):
+    global callbacks_errno
+    callbacks_errno = err
+
+
+def teardown_fail_callbacks():
+ global callbacks_errno
+ callbacks_errno = 0
+    ctypes.set_errno(0)
+
+
+def teardown_function(function):
+ teardown_fail_callbacks()
+
+
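+# transition_to_state() (from libvfio_user.py) issues a
+# VFIO_USER_DEVICE_FEATURE SET of the migration state; this wrapper just
+# supplies the module-level ctx and socket.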
+def transition_to_migr_state(state, expect=0, rsp=True, busy=False):
+ return transition_to_state(ctx, client.sock, state, expect, rsp, busy)
+
+
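+# A VFIO_USER_MIG_DATA_WRITE request is this header followed by `size` bytes
+# of device state; VFIO_USER_MIG_DATA_READ sends just the header and the
+# server appends the data to the echoed header in its reply.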
+def mig_data_payload(data):
+ argsz = len(vfio_user_mig_data()) + len(data)
+ return vfio_user_mig_data(
+ argsz=argsz,
+ size=len(data)
+ )
+
+
+def test_migration_setup():
global ctx, client
ctx = vfu_create_ctx(flags=LIBVFIO_USER_FLAG_ATTACH_NB)
assert ctx is not None
- ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX,
- size=2 << PAGE_SHIFT, flags=VFU_REGION_FLAG_RW)
- assert ret == 0
+ cbs = vfu_migration_callbacks_t()
+    cbs.version = 1  # old callbacks version
+ cbs.transition = migr_trans_cb
+ cbs.read_data = migr_read_data_cb
+ cbs.write_data = migr_write_data_cb
- ret = vfu_setup_device_migration_callbacks(ctx)
+ ret = vfu_setup_device_migration_callbacks(ctx, cbs)
+ assert ret < 0, "do not allow old callbacks version"
+
+    cbs.version = VFU_MIGR_CALLBACKS_VERS  # new callbacks version
+ ret = vfu_setup_device_migration_callbacks(ctx, cbs)
assert ret == 0
vfu_setup_device_quiesce_cb(ctx)
@@ -54,113 +173,409 @@ def setup_function(function):
ret = vfu_realize_ctx(ctx)
assert ret == 0
- client = connect_client(ctx)
-
+ caps = {
+ "capabilities": {
+ "max_data_xfer_size": MAX_DATA_XFER_SIZE,
+ }
+ }
-def teardown_function(function):
- global ctx
- vfu_destroy_ctx(ctx)
+ client = connect_client(ctx, caps)
-@patch('libvfio_user.quiesce_cb')
-@patch('libvfio_user.migr_trans_cb')
-def test_migration_bad_access(mock_trans, mock_quiesce):
+def server_transition_track_path(a, b, expectA=0, expectB=0):
"""
- Tests that attempting to access the migration state register in an
- non-aligned manner fails.
-
- This test is important because we tell whether we need to quiesce by
- checking for a register-sized access, otherwise we'll change migration
- state without having quiesced.
+ Carry out the state transition from a to b on the server, keeping track of
+ and returning the transition path taken.
"""
- global ctx, client
- data = VFIO_DEVICE_STATE_V1_SAVING.to_bytes(c.sizeof(c.c_int), 'little')
- write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0,
- count=len(data)-1, data=data, expect=errno.EINVAL)
+ global path
- mock_trans.assert_not_called()
+ if current_state == VFIO_USER_DEVICE_STATE_STOP_COPY and \
+ a == VFIO_USER_DEVICE_STATE_PRE_COPY:
+ # The transition STOP_COPY -> PRE_COPY is explicitly blocked so we
+ # advance one state to get around this in order to set up the test.
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_STOP)
+ transition_to_migr_state(a, expect=expectA)
-@patch('libvfio_user.quiesce_cb')
-@patch('libvfio_user.migr_trans_cb', return_value=0)
-def test_migration_trans_sync(mock_trans, mock_quiesce):
- """
- Tests transitioning to the saving state.
- """
+ if expectA != 0:
+ return None
- global ctx, client
+ path = []
- data = VFIO_DEVICE_STATE_V1_SAVING.to_bytes(c.sizeof(c.c_int), 'little')
- write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0,
- count=len(data), data=data)
+ transition_to_migr_state(b, expect=expectB)
- ret = vfu_run_ctx(ctx)
- assert ret == 0
+ return path.copy()
-@patch('libvfio_user.migr_trans_cb', side_effect=fail_with_errno(errno.EPERM))
-def test_migration_trans_sync_err(mock_trans):
+def test_migration_shortest_state_transition_paths():
"""
- Tests the device returning an error when the migration state is written to.
+ The spec dictates that complex state transitions are to be implemented as
+ combinations of the defined direct transitions, with the path selected
+ according to the following rules:
+
+ - Select the shortest path.
+ - The path cannot have saving group states as interior arcs, only start/end
+ states.
+
+ This test implements a breadth-first search to ensure that the paths taken
+ by the implementation correctly follow these rules.
"""
- global ctx, client
+ # allowed direct transitions (edges)
+ E = {
+ VFIO_USER_DEVICE_STATE_ERROR: set(),
+ VFIO_USER_DEVICE_STATE_STOP: {
+ VFIO_USER_DEVICE_STATE_RUNNING,
+ VFIO_USER_DEVICE_STATE_STOP_COPY,
+ VFIO_USER_DEVICE_STATE_RESUMING
+ },
+ VFIO_USER_DEVICE_STATE_RUNNING: {
+ VFIO_USER_DEVICE_STATE_STOP,
+ VFIO_USER_DEVICE_STATE_PRE_COPY
+ },
+ VFIO_USER_DEVICE_STATE_STOP_COPY: {VFIO_USER_DEVICE_STATE_STOP},
+ VFIO_USER_DEVICE_STATE_RESUMING: {VFIO_USER_DEVICE_STATE_STOP},
+ VFIO_USER_DEVICE_STATE_RUNNING_P2P: set(),
+ VFIO_USER_DEVICE_STATE_PRE_COPY: {
+ VFIO_USER_DEVICE_STATE_RUNNING,
+ VFIO_USER_DEVICE_STATE_STOP_COPY
+ },
+ VFIO_USER_DEVICE_STATE_PRE_COPY_P2P: set()
+ }
+
+ # states (vertices)
+ V = E.keys()
+
+ # "saving states" which cannot be internal arcs
+ saving_states = {VFIO_USER_DEVICE_STATE_PRE_COPY,
+ VFIO_USER_DEVICE_STATE_STOP_COPY}
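+
+    # For example, RUNNING -> STOP_COPY must be realised as
+    # RUNNING -> STOP -> STOP_COPY: there is no direct edge, and the
+    # equal-length route via PRE_COPY would use a saving state as an
+    # interior arc.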
+
+ # Consider each vertex in turn to be the start state, that is, the state
+ # we are transitioning from.
+ for source in V:
+ # The previous node in the shortest path for each node, e.g. for
+ # shortest path `source -> node -> target`, `back[node] == source`.
+ back = {v: None for v in V}
+ queue = deque([(source, None)])
+
+ # Use BFS to calculate the shortest path from the start state to every
+ # other state, following the rule that no intermediate states can be
+ # saving states.
+ while len(queue) > 0:
+ (curr, prev) = queue.popleft()
+ back[curr] = prev
+
+            # Saving states may only start or end a path, so if the current
+            # node is a saving state other than the start, treat it as an
+            # endpoint and don't explore its neighbours.
+ if curr != source and curr in saving_states:
+ continue
+
+ for nxt in E[curr]:
+ if back[nxt] is None:
+ queue.append((nxt, curr))
+
+        # Consider each vertex in turn as the target state.
+ for target in V:
+ if source == VFIO_USER_DEVICE_STATE_STOP_COPY \
+ and target == VFIO_USER_DEVICE_STATE_PRE_COPY:
+                # This transition is explicitly blocked; it is covered
+                # separately in test_migration_stop_copy_to_pre_copy_rejected.
+ continue
+
+ # If BFS found a path to that state, follow the backpointers to
+ # calculate the path, and check that it's equal to the path taken
+ # by the server.
+ if back[target] is not None:
+ seq = deque([])
+ curr = target
+ while curr != source:
+ seq.appendleft(curr)
+ curr = back[curr]
+
+ server_seq = server_transition_track_path(source, target)
+
+                assert list(seq) == server_seq
+
+ # If BFS couldn't find a path to that state, check that the server
+ # doesn't allow that transition either.
+ else:
+                # If the start state is unreachable, we can't even transition
+                # into it to set up the test, so expect that first transition
+                # to fail.
+ expectA = errno.EINVAL if source in UNREACHABLE_STATES else 0
+
+ # No matter what, we expect transitioning to the target state
+ # to fail.
+ server_transition_track_path(source, target, expectA=expectA,
+ expectB=errno.EINVAL)
+
+
+def test_migration_stop_copy_to_pre_copy_rejected():
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_STOP_COPY)
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_PRE_COPY,
+ expect=errno.EINVAL)
+
+
+def test_migration_nonexistent_state():
+ transition_to_migr_state(0xabcd, expect=errno.EINVAL)
+
+
+def test_migration_failed_callback():
+ setup_fail_callbacks(0xbeef)
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_RUNNING, expect=0xbeef)
+    assert ctypes.get_errno() == 0xbeef
+ teardown_fail_callbacks()
+
+
+def test_migration_get_state():
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_RUNNING)
+
+ feature = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()) +
+ len(vfio_user_device_feature_mig_state()),
+ flags=VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
+ )
+
+ result = msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, feature)
+ _, result = vfio_user_device_feature.pop_from_buffer(result)
+ state, _ = vfio_user_device_feature_mig_state.pop_from_buffer(result)
+ assert state.device_state == VFIO_USER_DEVICE_STATE_RUNNING
- data = VFIO_DEVICE_STATE_V1_SAVING.to_bytes(c.sizeof(c.c_int), 'little')
- write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0,
- count=len(data), data=data, expect=errno.EPERM)
- ret = vfu_run_ctx(ctx)
- assert ret == 0
+def test_handle_mig_data_read():
+ global read_data
+
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_RUNNING)
+
+ data = bytes([0, 1, 2, 3])
+ payload = mig_data_payload(data)
+
+ VALID_STATES = {VFIO_USER_DEVICE_STATE_PRE_COPY,
+ VFIO_USER_DEVICE_STATE_STOP_COPY}
+
+ for state in STATES:
+ transition_to_migr_state(state)
+ read_data = data
+ expect = 0 if state in VALID_STATES else errno.EINVAL
+ result = msg(ctx, client.sock, VFIO_USER_MIG_DATA_READ, payload,
+ expect=expect)
+
+ if state in VALID_STATES:
+ assert len(result) == len(payload) + len(data)
+ assert result[len(vfio_user_mig_data()):] == data
-@patch('libvfio_user.quiesce_cb', side_effect=fail_with_errno(errno.EBUSY))
-@patch('libvfio_user.migr_trans_cb', return_value=0)
-def test_migration_trans_async(mock_trans, mock_quiesce):
+def test_handle_mig_data_read_too_long():
"""
- Tests transitioning to the saving state where the device is initially busy
- quiescing.
+    test_migration_setup() negotiates a maximum data transfer size of 4 bytes
+    (MAX_DATA_XFER_SIZE). Check that reading more than that in one transfer
+    fails.
"""
- global ctx, client
- mock_quiesce
+ global read_data
- data = VFIO_DEVICE_STATE_V1_SAVING.to_bytes(c.sizeof(c.c_int), 'little')
- write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0,
- count=len(data), data=data, rsp=False,
- busy=True)
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_RUNNING)
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_PRE_COPY)
- ret = vfu_device_quiesced(ctx, 0)
- assert ret == 0
+    # Create a read payload one byte longer than the maximum.
+ read_data = bytes([i for i in range(MAX_DATA_XFER_SIZE + 1)])
+ payload = mig_data_payload(read_data)
- get_reply(client.sock)
+ msg(ctx, client.sock, VFIO_USER_MIG_DATA_READ, payload,
+ expect=errno.EINVAL)
+
+
+def test_handle_mig_data_read_failed_callback():
+    global read_data
+
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_PRE_COPY)
+
+    read_data = bytes([1, 2, 3, 4])
+ payload = mig_data_payload(read_data)
+
+ setup_fail_callbacks(0xbeef)
+
+ msg(ctx, client.sock, VFIO_USER_MIG_DATA_READ, payload, expect=0xbeef)
+    assert ctypes.get_errno() == 0xbeef
+
+
+def test_handle_mig_data_read_short_write():
+ data = bytes([1, 2, 3, 4])
+ payload = bytes(mig_data_payload(data))
+
+ # don't send the last byte
+ msg(ctx, client.sock, VFIO_USER_MIG_DATA_READ, payload[:-1],
+ expect=errno.EINVAL)
+
+
+def test_handle_mig_data_write():
+ data = bytes([1, 2, 3, 4])
+ payload = mig_data_payload(data)
+
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_RESUMING)
+ msg(ctx, client.sock, VFIO_USER_MIG_DATA_WRITE, bytes(payload) + data)
+ assert write_data == data
- ret = vfu_run_ctx(ctx)
- assert ret == 0
+def test_handle_mig_data_write_invalid_state():
+ data = bytes([1, 2, 3, 4])
+ payload = mig_data_payload(data)
-@patch('libvfio_user.quiesce_cb', side_effect=fail_with_errno(errno.EBUSY))
-@patch('libvfio_user.migr_trans_cb', side_effect=fail_with_errno(errno.ENOTTY))
-def test_migration_trans_async_err(mock_trans, mock_quiesce):
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_RUNNING)
+ msg(ctx, client.sock, VFIO_USER_MIG_DATA_WRITE, bytes(payload) + data,
+ expect=errno.EINVAL)
+
+
+def test_handle_mig_data_write_too_long():
"""
- Tests writing to the migration state register, the device not being able to
- immediately quiesce, and then finally the device failing to transition to
- the new migration state.
+    test_migration_setup() negotiates a maximum data transfer size of 4 bytes
+    (MAX_DATA_XFER_SIZE). Check that writing more than that in one transfer
+    fails.
"""
- global ctx, client
+    # Create a write payload one byte longer than the maximum.
+ data = bytes([i for i in range(MAX_DATA_XFER_SIZE + 1)])
+ payload = mig_data_payload(data)
- data = VFIO_DEVICE_STATE_V1_RUNNING.to_bytes(c.sizeof(c.c_int), 'little')
- write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0,
- count=len(data), data=data, rsp=False,
- busy=True)
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_RESUMING)
+ msg(ctx, client.sock, VFIO_USER_MIG_DATA_WRITE, bytes(payload) + data,
+ expect=errno.EINVAL)
- ret = vfu_device_quiesced(ctx, 0)
- assert ret == 0
- print("waiting for reply")
- get_reply(client.sock, errno.ENOTTY)
- print("received reply")
+def test_handle_mig_data_write_failed_callback():
+ transition_to_migr_state(VFIO_USER_DEVICE_STATE_RESUMING)
+
+ data = bytes([1, 2, 3, 4])
+ payload = mig_data_payload(data)
+
+ setup_fail_callbacks(0xbeef)
+
+ msg(ctx, client.sock, VFIO_USER_MIG_DATA_WRITE, bytes(payload) + data,
+ expect=0xbeef)
+    assert ctypes.get_errno() == 0xbeef
+
+
+def test_handle_mig_data_write_short_write():
+ data = bytes([1, 2, 3, 4])
+ payload = mig_data_payload(data)
+
+ msg(ctx, client.sock, VFIO_USER_MIG_DATA_WRITE, payload,
+ expect=errno.EINVAL)
+
+
+def test_device_feature_migration_get():
+ payload = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()) +
+ len(vfio_user_device_feature_migration()),
+ flags=VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION
+ )
+
+ result = msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload)
+ _, result = vfio_user_device_feature.pop_from_buffer(result)
+    migration, _ = vfio_user_device_feature_migration.pop_from_buffer(result)
+    flags = migration.flags
+
+ assert flags == VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY
+
+
+def test_device_feature_short_write():
+ payload = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()) +
+ len(vfio_user_device_feature_migration()),
+ flags=VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION
+ )
+
+ payload = bytes(payload)
+
+ # don't send the last byte
+ msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload[:-1],
+ expect=errno.EINVAL)
+
+
+def test_device_feature_unsupported_operation():
+ payload = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()) +
+ len(vfio_user_device_feature_migration()),
+ flags=VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIGRATION
+ )
+
+ msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload,
+ expect=errno.EINVAL)
+
+
+def test_device_feature_bad_argsz_probe():
+ payload = vfio_user_device_feature(
+ argsz=2,
+ flags=VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_MIGRATION
+ )
+
+ msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload,
+ expect=errno.EINVAL)
+
+
+def test_device_feature_bad_argsz_get_migration():
+ payload = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()),
+ flags=VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION
+ )
+
+ msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload,
+ expect=errno.EINVAL)
+
+
+def test_device_feature_bad_argsz_get_dma():
+ argsz = len(vfio_user_device_feature()) + \
+ len(vfio_user_device_feature_dma_logging_report()) + \
+ get_bitmap_size(0x20 << PAGE_SHIFT, PAGE_SIZE)
+
+ feature = vfio_user_device_feature(
+ argsz=argsz - 1, # not big enough
+ flags=VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT | VFIO_DEVICE_FEATURE_GET
+ )
+
+ report = vfio_user_device_feature_dma_logging_report(
+ iova=0x10 << PAGE_SHIFT,
+ length=0x20 << PAGE_SHIFT,
+ page_size=PAGE_SIZE
+ )
+
+ msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, bytes(feature)
+ + bytes(report), expect=errno.EINVAL)
+
+
+def test_device_feature_bad_argsz_set():
+ feature = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()), # no space for state data
+ flags=VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
+ )
+ payload = vfio_user_device_feature_mig_state(
+ device_state=VFIO_USER_DEVICE_STATE_RUNNING
+ )
+ msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, bytes(feature)
+ + bytes(payload), expect=errno.EINVAL)
+
+
+def test_device_feature_probe():
+ payload = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()),
+ flags=VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_MIGRATION
+ )
+
+ result = msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload)
+ assert bytes(payload) == result
+
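+    # PROBE combined with GET or SET must fail.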
+ payload = vfio_user_device_feature(
+ argsz=len(vfio_user_device_feature()),
+ flags=VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_SET |
+ VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION
+ )
+
+ msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload,
+ expect=errno.EINVAL)
+
+
+def test_migration_cleanup():
+ client.disconnect(ctx)
+ vfu_destroy_ctx(ctx)
# ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: #
diff --git a/test/py/test_quiesce.py b/test/py/test_quiesce.py
index 3f72827..3e1dbca 100644
--- a/test/py/test_quiesce.py
+++ b/test/py/test_quiesce.py
@@ -31,9 +31,10 @@ from libvfio_user import *
import errno
from unittest import mock
from unittest.mock import patch
-
+import tempfile
ctx = None
+client = None
def setup_function(function):
@@ -197,32 +198,28 @@ def test_allowed_funcs_in_quiesced_dma_unregister_busy(mock_quiesce,
@patch('libvfio_user.migr_trans_cb', side_effect=_side_effect)
@patch('libvfio_user.quiesce_cb')
-def test_allowed_funcs_in_quiesed_migration(mock_quiesce,
+def test_allowed_funcs_in_quiesced_migration(mock_quiesce,
mock_trans):
global ctx, client
_map_dma_region(ctx, client.sock)
- data = VFIO_DEVICE_STATE_V1_SAVING.to_bytes(c.sizeof(c.c_int), 'little')
- write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0,
- count=len(data), data=data)
- mock_trans.assert_called_once_with(ctx, VFIO_DEVICE_STATE_V1_SAVING)
+ transition_to_state(ctx, client.sock, VFIO_USER_DEVICE_STATE_STOP)
+ mock_trans.assert_called_once_with(ctx, VFU_MIGR_STATE_STOP)
@patch('libvfio_user.migr_trans_cb', side_effect=_side_effect)
@patch('libvfio_user.quiesce_cb')
-def test_allowed_funcs_in_quiesed_migration_busy(mock_quiesce,
+def test_allowed_funcs_in_quiesced_migration_busy(mock_quiesce,
mock_trans):
global ctx, client
_map_dma_region(ctx, client.sock)
mock_quiesce.side_effect = fail_with_errno(errno.EBUSY)
- data = VFIO_DEVICE_STATE_V1_STOP.to_bytes(c.sizeof(c.c_int), 'little')
- write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0,
- count=len(data), data=data, rsp=False,
- busy=True)
+ transition_to_state(ctx, client.sock, VFIO_USER_DEVICE_STATE_STOP,
+ busy=True)
ret = vfu_device_quiesced(ctx, 0)
assert ret == 0
- mock_trans.assert_called_once_with(ctx, VFIO_DEVICE_STATE_V1_STOP)
+ mock_trans.assert_called_once_with(ctx, VFU_MIGR_STATE_STOP)
@patch('libvfio_user.reset_cb', side_effect=_side_effect)
diff --git a/test/py/test_request_errors.py b/test/py/test_request_errors.py
index c25a715..1f89e91 100644
--- a/test/py/test_request_errors.py
+++ b/test/py/test_request_errors.py
@@ -54,10 +54,6 @@ def setup_function(function):
ret = vfu_setup_device_reset_cb(ctx)
assert ret == 0
- ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX,
- size=2 << PAGE_SHIFT, flags=VFU_REGION_FLAG_RW)
- assert ret == 0
-
ret = vfu_setup_device_migration_callbacks(ctx)
assert ret == 0
@@ -189,24 +185,21 @@ def test_disconnected_socket_quiesce_busy(mock_quiesce):
@patch('libvfio_user.reset_cb')
@patch('libvfio_user.quiesce_cb', side_effect=fail_with_errno(errno.EBUSY))
-@patch('libvfio_user.migr_get_pending_bytes_cb')
-def test_reply_fail_quiesce_busy(mock_get_pending_bytes, mock_quiesce,
+@patch('libvfio_user.migr_trans_cb')
+def test_reply_fail_quiesce_busy(mock_migr_trans_cb, mock_quiesce,
mock_reset):
"""Tests failing to reply and the quiesce callback returning EBUSY."""
global ctx, client
- def get_pending_bytes_side_effect(ctx):
+ def migr_trans_cb_side_effect(ctx, state):
client.sock.close()
return 0
- mock_get_pending_bytes.side_effect = get_pending_bytes_side_effect
-
- # read the get_pending_bytes register, it should close the socket causing
- # the reply to fail
- read_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX,
- vfio_user_migration_info.pending_bytes.offset,
- vfio_user_migration_info.pending_bytes.size, rsp=False,
- busy=True)
+ mock_migr_trans_cb.side_effect = migr_trans_cb_side_effect
+
+    # change the state; the callback closes the socket so the reply fails
+ transition_to_state(ctx, client.sock, VFIO_USER_DEVICE_STATE_STOP_COPY,
+ rsp=False, busy=True)
# vfu_run_ctx will try to reset the context and to do that it needs to
# quiesce the device first
diff --git a/test/py/test_setup_region.py b/test/py/test_setup_region.py
index 05e6457..f266ed2 100644
--- a/test/py/test_setup_region.py
+++ b/test/py/test_setup_region.py
@@ -111,30 +111,6 @@ def test_setup_region_bad_pci():
assert c.get_errno() == errno.EINVAL
-def test_setup_region_bad_migr():
- ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX, size=512,
- flags=(VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM))
- assert ret == -1
- assert c.get_errno() == errno.EINVAL
-
- f = tempfile.TemporaryFile()
- f.truncate(0x2000)
-
- ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX, size=0x2000,
- flags=(VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM),
- fd=f.fileno())
- assert ret == -1
- assert c.get_errno() == errno.EINVAL
-
- mmap_areas = [(0x0, 0x1000), (0x1000, 0x1000)]
-
- ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX, size=0x2000,
- flags=(VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM),
- mmap_areas=mmap_areas, fd=f.fileno())
- assert ret == -1
- assert c.get_errno() == errno.EINVAL
-
-
def test_setup_region_cfg_always_cb_nocb():
ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_CFG_REGION_IDX,
size=PCI_CFG_SPACE_EXP_SIZE, cb=None,
diff --git a/test/unit-tests.c b/test/unit-tests.c
index 310eb23..fba7225 100644
--- a/test/unit-tests.c
+++ b/test/unit-tests.c
@@ -399,182 +399,6 @@ typedef struct {
} tran_sock_t;
static void
-test_migration_state_transitions(void **state UNUSED)
-{
- bool (*f)(uint32_t, uint32_t) = vfio_migr_state_transition_is_valid;
- uint32_t i, j;
-
- /* from stopped (000b): all transitions are invalid except to running */
- assert_true(f(0, 0));
- assert_true(f(0, 1));
- for (i = 2; i < 8; i++) {
- assert_false(f(0, i));
- }
-
- /* from running (001b) */
- assert_true(f(1, 0));
- assert_true(f(1, 1));
- assert_true(f(1, 2));
- assert_true(f(1, 3));
- assert_true(f(1, 4));
- assert_false(f(1, 5));
- assert_true(f(1, 6));
- assert_false(f(1, 5));
-
- /* from stop-and-copy (010b) */
- assert_true(f(2, 0));
- assert_true(f(2, 1));
- assert_true(f(2, 2));
- assert_false(f(2, 3));
- assert_false(f(2, 4));
- assert_false(f(2, 5));
- assert_true(f(2, 6));
- assert_false(f(2, 7));
-
- /* from pre-copy (011b) */
- assert_true(f(3, 0));
- assert_true(f(3, 1));
- assert_true(f(3, 2));
- assert_false(f(3, 3));
- assert_false(f(3, 4));
- assert_false(f(3, 5));
- assert_true(f(3, 6));
- assert_false(f(3, 7));
-
- /* from resuming (100b) */
- assert_false(f(4, 0));
- assert_true(f(4, 1));
- assert_false(f(4, 2));
- assert_false(f(4, 3));
- assert_true(f(4, 4));
- assert_false(f(4, 5));
- assert_true(f(4, 6));
- assert_false(f(4, 7));
-
- /*
- * Transitioning to any other state from the remaining 3 states
- * (101b - invalid, 110b - error, 111b - invalid) is invalid.
- * Transitioning from the error state to the stopped state is possible but
- * that requires a device reset, so we don't consider it a valid state
- * transition.
- */
- for (i = 5; i < 8; i++) {
- for (j = 0; j < 8; j++) {
- assert_false(f(i, j));
- }
- }
-}
-
-static struct test_setup_migr_reg_dat {
- vfu_ctx_t *v;
- size_t rs; /* migration registers size */
- size_t ds; /* migration data size */
- size_t s; /* migration region size*/
- const vfu_migration_callbacks_t c;
-} migr_reg_data = {
- .c = {
- .version = VFU_MIGR_CALLBACKS_VERS,
- .transition = (void *)0x1,
- .get_pending_bytes = (void *)0x2,
- .prepare_data = (void *)0x3,
- .read_data = (void *)0x4,
- .write_data = (void *)0x5,
- .data_written = (void *)0x6
- }
-};
-
-static int
-setup_test_setup_migration_region(void **state)
-{
- struct test_setup_migr_reg_dat *p = &migr_reg_data;
- p->v = vfu_create_ctx(VFU_TRANS_SOCK, "test", 0, NULL,
- VFU_DEV_TYPE_PCI);
- if (p->v == NULL) {
- return -1;
- }
- p->rs = ROUND_UP(sizeof(struct vfio_user_migration_info),
- sysconf(_SC_PAGE_SIZE));
- p->ds = sysconf(_SC_PAGE_SIZE);
- p->s = p->rs + p->ds;
- *state = p;
- return setup(state);
-}
-
-static vfu_ctx_t *
-get_vfu_ctx(void **state)
-{
- return (*((struct test_setup_migr_reg_dat **)(state)))->v;
-}
-
-static int
-teardown_test_setup_migration_region(void **state)
-{
- struct test_setup_migr_reg_dat *p = *state;
- vfu_destroy_ctx(p->v);
- return 0;
-}
-
-static void
-test_setup_migration_region_size_ok(void **state)
-{
- vfu_ctx_t *v = get_vfu_ctx(state);
- int r = vfu_setup_region(v, VFU_PCI_DEV_MIGR_REGION_IDX,
- vfu_get_migr_register_area_size(), NULL,
- VFU_REGION_FLAG_READ | VFU_REGION_FLAG_WRITE, NULL, 0, -1, 0);
- assert_int_equal(0, r);
-}
-
-static void
-test_setup_migration_region_sparsely_mappable_valid(void **state)
-{
- struct test_setup_migr_reg_dat *p = *state;
- struct iovec mmap_areas[] = {
- [0] = {
- .iov_base = (void *)p->rs,
- .iov_len = p->ds
- }
- };
- int r = vfu_setup_region(p->v, VFU_PCI_DEV_MIGR_REGION_IDX, p->s, NULL,
- VFU_REGION_FLAG_READ | VFU_REGION_FLAG_WRITE, mmap_areas, 1,
- 0xdeadbeef, 0);
- assert_int_equal(0, r);
-}
-
-static void
-test_setup_migration_callbacks_without_migration_region(void **state)
-{
- struct test_setup_migr_reg_dat *p = *state;
- assert_int_equal(-1, vfu_setup_device_migration_callbacks(p->v, &p->c, 0));
- assert_int_equal(EINVAL, errno);
-}
-
-static void
-test_setup_migration_callbacks_bad_data_offset(void **state)
-{
- struct test_setup_migr_reg_dat *p = *state;
- int r = vfu_setup_region(p->v, VFU_PCI_DEV_MIGR_REGION_IDX, p->s, NULL,
- VFU_REGION_FLAG_READ | VFU_REGION_FLAG_WRITE, NULL, 0, -1, 0);
- assert_int_equal(0, r);
- r = vfu_setup_device_migration_callbacks(p->v, &p->c,
- vfu_get_migr_register_area_size() - 1);
- assert_int_equal(-1, r);
-}
-
-static void
-test_setup_migration_callbacks(void **state)
-{
- struct test_setup_migr_reg_dat *p = *state;
- int r = vfu_setup_region(p->v, VFU_PCI_DEV_MIGR_REGION_IDX, p->s, NULL,
- VFU_REGION_FLAG_READ | VFU_REGION_FLAG_WRITE, NULL, 0, -1, 0);
- assert_int_equal(0, r);
- r = vfu_setup_device_migration_callbacks(p->v, &p->c,
- vfu_get_migr_register_area_size());
- assert_int_equal(0, r);
- assert_non_null(p->v->migration);
- /* FIXME can't validate p->v->migration because it's a private strcut, need to move it out of lib/migration.c */
-}
-
-static void
test_device_is_stopped_and_copying(UNUSED void **state)
{
assert_false(device_is_stopped_and_copying(vfu_ctx.migration));
@@ -583,19 +407,16 @@ test_device_is_stopped_and_copying(UNUSED void **state)
size_t i;
struct migration migration;
vfu_ctx.migration = &migration;
- for (i = 0; i < ARRAY_SIZE(migr_states); i++) {
- if (migr_states[i].name == NULL) {
- continue;
- }
- migration.info.device_state = i;
+ for (i = 0; i < VFIO_USER_DEVICE_NUM_STATES; i++) {
+ migration.state = i;
bool r = device_is_stopped_and_copying(vfu_ctx.migration);
- if (i == VFIO_DEVICE_STATE_V1_SAVING) {
+ if (i == VFIO_USER_DEVICE_STATE_STOP_COPY) {
assert_true(r);
} else {
assert_false(r);
}
r = device_is_stopped(vfu_ctx.migration);
- if (i == VFIO_DEVICE_STATE_V1_STOP) {
+ if (i == VFIO_USER_DEVICE_STATE_STOP) {
assert_true(r);
} else {
assert_false(r);
@@ -611,8 +432,10 @@ test_cmd_allowed_when_stopped_and_copying(UNUSED void **state)
for (i = 0; i < VFIO_USER_MAX; i++) {
bool r = cmd_allowed_when_stopped_and_copying(i);
- if (i == VFIO_USER_REGION_READ || i == VFIO_USER_REGION_WRITE ||
- i == VFIO_USER_DIRTY_PAGES) {
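+        /*
+         * While stopped and copying, only region accesses, device feature
+         * queries, and migration data reads remain permitted.
+         */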
+ if (i == VFIO_USER_REGION_READ ||
+ i == VFIO_USER_REGION_WRITE ||
+ i == VFIO_USER_DEVICE_FEATURE ||
+ i == VFIO_USER_MIG_DATA_READ) {
assert_true(r);
} else {
assert_false(r);
@@ -623,7 +446,7 @@ test_cmd_allowed_when_stopped_and_copying(UNUSED void **state)
static void
test_should_exec_command(UNUSED void **state)
{
- struct migration migration = { { 0 } };
+ struct migration migration = { 0 };
vfu_ctx.migration = &migration;
@@ -675,22 +498,6 @@ main(void)
cmocka_unit_test_setup(test_dma_controller_remove_region_unmapped, setup),
cmocka_unit_test_setup(test_dma_addr_to_sgl, setup),
cmocka_unit_test_setup(test_vfu_setup_device_dma, setup),
- cmocka_unit_test_setup(test_migration_state_transitions, setup),
- cmocka_unit_test_setup_teardown(test_setup_migration_region_size_ok,
- setup_test_setup_migration_region,
- teardown_test_setup_migration_region),
- cmocka_unit_test_setup_teardown(test_setup_migration_region_sparsely_mappable_valid,
- setup_test_setup_migration_region,
- teardown_test_setup_migration_region),
- cmocka_unit_test_setup_teardown(test_setup_migration_callbacks_without_migration_region,
- setup_test_setup_migration_region,
- teardown_test_setup_migration_region),
- cmocka_unit_test_setup_teardown(test_setup_migration_callbacks_bad_data_offset,
- setup_test_setup_migration_region,
- teardown_test_setup_migration_region),
- cmocka_unit_test_setup_teardown(test_setup_migration_callbacks,
- setup_test_setup_migration_region,
- teardown_test_setup_migration_region),
cmocka_unit_test_setup(test_device_is_stopped_and_copying, setup),
cmocka_unit_test_setup(test_cmd_allowed_when_stopped_and_copying, setup),
cmocka_unit_test_setup(test_should_exec_command, setup),