/*
 * Copyright (c) 2019 Nutanix Inc. All rights reserved.
 *
 * Authors: Thanos Makatos <thanos@nutanix.com>
 *          Swapnil Ingle <swapnil.ingle@nutanix.com>
 *          Felipe Franciosi <felipe@nutanix.com>
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions are met:
 *      * Redistributions of source code must retain the above copyright
 *        notice, this list of conditions and the following disclaimer.
 *      * Redistributions in binary form must reproduce the above copyright
 *        notice, this list of conditions and the following disclaimer in the
 *        documentation and/or other materials provided with the distribution.
 *      * Neither the name of Nutanix nor the names of its contributors may be
 *        used to endorse or promote products derived from this software without
 *        specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 *  ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 *  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 *  DAMAGE.
 *
 */

/*
 * Defines the libvfio-user server-side API.  The protocol definitions can be
 * found in vfio-user.h.
 *
 * This is not currently a stable API or ABI, and may change at any time.
 * Library calls are not guaranteed thread-safe: multi-threaded consumers need
 * to protect calls with their own exclusion methods.
 */

#ifndef LIB_VFIO_USER_H
#define LIB_VFIO_USER_H

#include <stdint.h>
#include <sys/uio.h>
#include <unistd.h>
#include <syslog.h>
#include <sys/queue.h>

#include "pci_caps/dsn.h"
#include "pci_caps/msi.h"
#include "pci_caps/msix.h"
#include "pci_caps/pm.h"
#include "pci_caps/px.h"
#include "pci_defs.h"
#include "vfio-user.h"

#ifdef __cplusplus
extern "C" {
#endif

#define LIB_VFIO_USER_MAJOR 0
#define LIB_VFIO_USER_MINOR 2

/* DMA addresses cannot be directly de-referenced. */
typedef void *vfu_dma_addr_t;

struct dma_sg;
typedef struct dma_sg dma_sg_t;

typedef struct vfu_ctx vfu_ctx_t;

/*
 * Returns the size, in bytes, of dma_sg_t.
 */
size_t
dma_sg_size(void);
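
/*
 * Since dma_sg_t is opaque, callers allocate scatter/gather arrays using
 * dma_sg_size(). A minimal sketch:
 *
 *     // Allocate room for up to 8 scatter/gather entries.
 *     dma_sg_t *sgl = calloc(8, dma_sg_size());
 *     if (sgl == NULL) {
 *         err(EXIT_FAILURE, "calloc");
 *     }
 */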

/*
 * When this flag is set, attaching to the transport is non-blocking: the
 * caller must then manually call vfu_attach_ctx(), which is also non-blocking,
 * as many times as necessary.
 *
 * This also applies to vfu_run_ctx(). However, it's presumed that any actual
 * reads or writes of the socket connection will not need to block, since both
 * APIs are synchronous.
 */
#define LIBVFIO_USER_FLAG_ATTACH_NB  (1 << 0)

typedef enum {
    VFU_TRANS_SOCK,
    /* For internal testing only. */
    VFU_TRANS_PIPE,
    VFU_TRANS_MAX
} vfu_trans_t;

typedef enum {
    VFU_DEV_TYPE_PCI
} vfu_dev_type_t;

/**
 * Creates a libvfio-user context. By default, one ERR and one REQ IRQ are
 * initialized; this can be overridden with vfu_setup_device_nr_irqs().
 *
 * @trans: transport type
 * @path: path to socket file.
 * @flags: context flags (LIBVFIO_USER_FLAG_*)
 * @pvt: private data
 * @dev_type: device type
 *
 * @returns the vfu_ctx to be used or NULL on error. Sets errno.
 */
vfu_ctx_t *
vfu_create_ctx(vfu_trans_t trans, const char *path,
               int flags, void *pvt, vfu_dev_type_t dev_type);

/*
 * Finalizes the device, making it ready for vfu_attach_ctx(). Calling this
 * function before vfu_attach_ctx() is mandatory.
 * @vfu_ctx: the libvfio-user context
 *
 * @returns: 0 on success, -1 on error. Sets errno.
 */
int
vfu_realize_ctx(vfu_ctx_t *vfu_ctx);

/*
 * Attempts to attach to the transport. Attaching is mandatory before
 * vfu_run_ctx(), and is non-blocking if the context was created with the
 * LIBVFIO_USER_FLAG_ATTACH_NB flag.
 *
 * @returns: 0 on success, -1 on error. Sets errno.  If errno is set to EAGAIN
 * or EWOULDBLOCK then the transport is not ready to attach to and the operation
 * must be retried.
 *
 * @vfu_ctx: the libvfio-user context
 */
int
vfu_attach_ctx(vfu_ctx_t *vfu_ctx);

/**
 * Return a file descriptor suitable for waiting on via epoll() or similar. The
 * file descriptor may change after a successful vfu_attach_ctx(), or after
 * vfu_run_ctx() fails with ENOTCONN; in those cases, vfu_get_poll_fd() should
 * be called again to get the current correct file descriptor.
 */
int
vfu_get_poll_fd(vfu_ctx_t *vfu_ctx);

/**
 * Polls the vfu_ctx and processes the command received from the client.
 * - Blocking vfu_ctx:
 *   Blocks until a new request is received from the client and continues
 *   processing requests. Returns only on error or if the client disconnects.
 * - Non-blocking vfu_ctx (LIBVFIO_USER_FLAG_ATTACH_NB):
 *   Processes one request from the client if one is available; otherwise it
 *   returns immediately and the caller is responsible for calling it again
 *   periodically.
 *
 * @vfu_ctx: The libvfio-user context to poll
 *
 * @returns the number of requests processed (0 or more); or -1 on error,
 *          with errno set as follows:
 *
 * ENOTCONN: client closed connection, vfu_attach_ctx() should be called again
 * EBUSY: the device was asked to quiesce and is still quiescing
 * Other errno values are also possible.
 */
int
vfu_run_ctx(vfu_ctx_t *vfu_ctx);
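
/*
 * A minimal sketch of a non-blocking server loop (context created with
 * LIBVFIO_USER_FLAG_ATTACH_NB; error handling abbreviated):
 *
 *     int epfd = epoll_create1(0);    // <sys/epoll.h>
 *     struct epoll_event ev = { .events = EPOLLIN };
 *
 *     while (vfu_attach_ctx(vfu_ctx) < 0) {
 *         if (errno != EAGAIN && errno != EWOULDBLOCK) {
 *             err(EXIT_FAILURE, "vfu_attach_ctx");
 *         }
 *         // not ready yet: retry, or wait on vfu_get_poll_fd()
 *     }
 *
 *     epoll_ctl(epfd, EPOLL_CTL_ADD, vfu_get_poll_fd(vfu_ctx), &ev);
 *
 *     for (;;) {
 *         epoll_wait(epfd, &ev, 1, -1);
 *         if (vfu_run_ctx(vfu_ctx) < 0 && errno == ENOTCONN) {
 *             break;    // client disconnected: re-attach with a new poll fd
 *         }
 *     }
 */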

/**
 * Destroys libvfio-user context. During this call the device must already be
 * in quiesced state; the quiesce callback is not called. Any other device
 * callback can be called.
 *
 * @vfu_ctx: the libvfio-user context to destroy
 */
void
vfu_destroy_ctx(vfu_ctx_t *vfu_ctx);

/**
 * Return the private pointer given to vfu_create_ctx().
 */
void *
vfu_get_private(vfu_ctx_t *vfu_ctx);

/**
 * Callback function signature for the log function.
 *
 * @vfu_ctx: the libvfio-user context
 * @level: log level as defined in syslog(3)
 * @msg: message
 */
typedef void (vfu_log_fn_t)(vfu_ctx_t *vfu_ctx, int level, const char *msg);

/**
 * Log to the logging function configured for this context. The format should
 * not include a new line.
 */
void
vfu_log(vfu_ctx_t *vfu_ctx, int level, const char *fmt, ...)
    __attribute__((format(printf, 3, 4)));

/**
 * Set up logging information.
 * @vfu_ctx: the libvfio-user context
 * @log: logging function
 * @level: logging level as defined in syslog(3)
 *
 * The log handler is expected to add a newline (that is, log messages do not
 * include a newline).
 */
int
vfu_setup_log(vfu_ctx_t *vfu_ctx, vfu_log_fn_t *log, int level);
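
/*
 * A minimal sketch of a log handler and its registration; note that the
 * handler, not the caller, adds the trailing newline:
 *
 *     static void log_cb(vfu_ctx_t *vfu_ctx, int level, const char *msg) {
 *         fprintf(stderr, "[%d] %s\n", level, msg);
 *     }
 *
 *     vfu_setup_log(vfu_ctx, log_cb, LOG_DEBUG);
 */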

/**
 * Prototype for region access callback. When a region is accessed, libvfio-user
 * calls the previously registered callback with the following arguments:
 *
 * @vfu_ctx: the libvfio-user context
 * @buf: buffer containing the data to be written or data to be read into
 * @count: number of bytes being read or written
 * @offset: byte offset within the region
 * @is_write: whether or not this is a write
 *
 * @returns the number of bytes read or written, or -1 on error, setting errno.
 */
typedef ssize_t (vfu_region_access_cb_t)(vfu_ctx_t *vfu_ctx, char *buf,
                                         size_t count, loff_t offset,
                                         bool is_write);

#define VFU_REGION_FLAG_READ      (1 << 0)
#define VFU_REGION_FLAG_WRITE     (1 << 1)
#define VFU_REGION_FLAG_RW        (VFU_REGION_FLAG_READ | VFU_REGION_FLAG_WRITE)
/* If unset, this is an IO region. */
#define VFU_REGION_FLAG_MEM       (1 << 2)
#define VFU_REGION_FLAG_ALWAYS_CB (1 << 3)
#define VFU_REGION_FLAG_64_BITS   (1 << 4)
#define VFU_REGION_FLAG_PREFETCH  (1 << 5)
#define VFU_REGION_FLAG_MASK      (VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM | \
                                   VFU_REGION_FLAG_ALWAYS_CB | VFU_REGION_FLAG_64_BITS | \
                                   VFU_REGION_FLAG_PREFETCH)

/**
 * Set up a device region.
 *
 * A region is an area of device memory that can be accessed by the client,
 * either via VFIO_USER_REGION_READ/WRITE, or directly by mapping the region
 * into the client's address space if an fd is given.
 *
 * A mappable region can be split into mappable sub-areas according to the
 * @mmap_areas array. Note that the client can memory-map any part of the
 * file descriptor, even parts it is not supposed to map according to
 * @mmap_areas; there is no way in Linux to prevent this.
 *
 * TODO maybe we should introduce per-sparse-region file descriptors so that
 * the client cannot possibly memory-map areas it's not supposed to. Even if
 * the client needs to have the region under the same backing file, it is
 * possible to create linear device-mapper targets, one for each area, and
 * provide file descriptors of these DM targets. This is something we can
 * document and demonstrate in a sample.
 *
 * Areas that are accessed via such a mapping by definition do not invoke any
 * given callback.  However, the callback can still be invoked, even on a
 * mappable area, if the client chooses to call VFIO_USER_REGION_READ/WRITE.
 *
 * The following regions are special and are explained below:
 *  - VFU_PCI_DEV_CFG_REGION_IDX,
 *  - VFU_PCI_DEV_MIGR_REGION_IDX, and
 *  - VFU_GENERIC_DEV_MIGR_REGION_IDX.
 *
 * Region VFU_PCI_DEV_CFG_REGION_IDX, corresponding to PCI config space, has
 * special handling:
 *
 *  - the @size argument is ignored: the region size is always the size defined
 *    by the relevant PCI specification
 *  - all accesses to the standard PCI header (i.e. the first 64 bytes of the
 *    region) are handled by the library
 *  - all accesses to known PCI capabilities (see vfu_pci_add_capability())
 *    are handled by the library
 *  - if no callback is provided, reads to other areas are a simple memcpy(),
 *    and writes are an error
 *  - otherwise, the callback is expected to handle the access
 *  - if VFU_REGION_FLAG_ALWAYS_CB flag is set, all accesses to the config
 *    space are forwarded to the callback
 *
 * Regions VFU_PCI_DEV_MIGR_REGION_IDX and VFU_GENERIC_DEV_MIGR_REGION_IDX,
 * corresponding to the migration region, enable live migration support for
 * the device. The migration region must contain at the beginning the migration
 * registers (struct vfio_user_migration_info) and the remaining part of the
 * region can be arbitrarily used by the device implementation. The region
 * provided must have at least vfu_get_migr_register_area_size() bytes available
 * at the start of the region (this size is guaranteed to be page-aligned). If
 * mmap_areas is given, it must _not_ include this part of the region.
 *
 * libvfio-user offers two ways for the migration region to be used:
 *  1. natively: the device implementation must handle accesses to the
 *      migration registers and migration data via the region callbacks. The
 *      semantics of these registers are explained in <linux/vfio.h>.
 *  2. via the vfu_migration_callbacks_t callbacks: the device implementation
 *      registers a set of callbacks by calling
 *      vfu_setup_device_migration_callbacks(). The region's read/write
 *      callbacks are never called.
 *
 * @vfu_ctx: the libvfio-user context
 * @region_idx: region index
 * @size: size of the region
 * @region_access: callback function to access region
 * @flags: region flags (VFU_REGION_FLAG_*)
 * @mmap_areas: array of memory mappable areas; if an fd is provided, but this
 * is NULL, then the entire region is mappable.
 * @nr_mmap_areas: number of sparse areas in @mmap_areas; must be provided if
 *  the @mmap_areas is non-NULL, or 0 otherwise.
 * @fd: file descriptor of the file backing the region if the region is
 *  mappable; it is the server's responsibility to create a file suitable for
 *  memory mapping by the client.
 * @offset: offset of the region within the fd, or zero.
 *
 * @returns 0 on success, -1 on error. Sets errno.
 */
int
vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size,
                 vfu_region_access_cb_t *region_access, int flags,
                 struct iovec *mmap_areas, uint32_t nr_mmap_areas,
                 int fd, uint64_t offset);
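
/*
 * A minimal sketch of a 4 KB memory BAR backed by a local buffer (the buffer
 * name and callback are illustrative only):
 *
 *     static char bar0[0x1000];
 *
 *     static ssize_t bar0_cb(vfu_ctx_t *vfu_ctx, char *buf, size_t count,
 *                            loff_t offset, bool is_write) {
 *         if ((size_t)offset + count > sizeof(bar0)) {
 *             errno = EINVAL;
 *             return -1;
 *         }
 *         if (is_write) {
 *             memcpy(bar0 + offset, buf, count);
 *         } else {
 *             memcpy(buf, bar0 + offset, count);
 *         }
 *         return count;
 *     }
 *
 *     vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, sizeof(bar0),
 *                      bar0_cb, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
 *                      NULL, 0, -1, 0);
 */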

typedef enum vfu_reset_type {
    /*
     * Client requested a device reset (for example, as part of a guest VM
     * reboot). The vfio-user context remains valid, but it's expected that all
     * ongoing operations are completed or cancelled, and any device state is
     * reset to a known-good initial state (including any PCI register state).
     */
    VFU_RESET_DEVICE,

    /*
     * The vfio-user socket client connection was closed or reset. The attached
     * context is cleaned up after returning from the reset callback, and
     * vfu_attach_ctx() must be called to establish a new client.
     */
    VFU_RESET_LOST_CONN,

    /*
     * Client requested to initiate PCI function level reset.
     */
    VFU_RESET_PCI_FLR
} vfu_reset_type_t;

/*
 * Device callback for quiescing the device.
 *
 * vfu_run_ctx() uses this callback to request that the device quiesce its
 * operation. A quiesced device must not call vfu_addr_to_sgl() or vfu_sgl_*(),
 * unless it does so from a device callback.
 *
 * The callback can return two values:
 * 1) 0: this indicates that the device was quiesced. vfu_run_ctx() then
 *      continues to execute, and when vfu_run_ctx() returns to the caller the
 *      device is unquiesced.
 * 2) -1 with errno set to EBUSY: this indicates that the device cannot
 *      immediately quiesce. In this case, vfu_run_ctx() returns -1 with errno
 *      set to EBUSY and future calls to vfu_run_ctx() return the same. Until
 *      the device quiesces it can continue to operate as normal. The device
 *      indicates that it has quiesced by calling vfu_device_quiesced(). When
 *      vfu_device_quiesced() returns, the device is no longer quiesced.
 *
 * A quiesced device should expect any of the following callbacks to be
 * executed: vfu_dma_register_cb_t, vfu_dma_unregister_cb_t, vfu_reset_cb_t,
 * and the migration transition callback. These callbacks are only called
 * after the device has been quiesced.
 *
 * The following example demonstrates how a device can use the SG routines and
 * friends while quiesced:
 *
 * A DMA region is mapped, libvfio-user calls the quiesce callback but the
 * device cannot immediately quiesce:
 *
 *     int quiesce_cb(vfu_ctx_t *vfu_ctx) {
 *         errno = EBUSY;
 *         return -1;
 *     }
 *
 * While quiescing, the device can continue to operate as normal, including
 * calling functions such as vfu_sgl_get(). Then, the device finishes
 * quiescing:
 *
 *     vfu_device_quiesced(vfu_ctx, 0);
 *
 * At this point, the device must have stopped using functions like
 * vfu_sgl_get(), for example by pausing any I/O threads. libvfio-user
 * eventually calls the dma_register device callback before
 * vfu_device_quiesced() returns. In this callback the device is allowed to
 * call functions such as vfu_sgl_get():
 *
 *     void dma_register_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) {
 *         vfu_sgl_get(vfu_ctx, ...);
 *     }
 *
 * Once vfu_device_quiesced() returns, the device is unquiesced.
 *
 * @vfu_ctx: the libvfio-user context
 *
 * @returns: 0 on success, -1 on failure with errno set.
 */
typedef int (vfu_device_quiesce_cb_t)(vfu_ctx_t *vfu_ctx);

/**
 * Sets up the device quiesce callback.
 *
 * @vfu_ctx: the libvfio-user context
 * @quiesce_cb: device quiesce callback
 */
void
vfu_setup_device_quiesce_cb(vfu_ctx_t *vfu_ctx,
                            vfu_device_quiesce_cb_t *quiesce_cb);

/*
 * Called by the device to complete a pending quiesce operation. After the
 * function returns the device is unquiesced.
 *
 * @vfu_ctx: the libvfio-user context
 * @quiesce_errno: 0 for success or errno in case the device fails to quiesce,
 *                 in which case the operation requiring the quiesce is failed
 *                 and the device is reset.
 *
 * @returns 0 on success, or -1 on failure. Sets errno.
 */
int
vfu_device_quiesced(vfu_ctx_t *vfu_ctx, int quiesce_errno);
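
/*
 * A minimal sketch of a deferred quiesce, assuming an illustrative
 * device-private structure with an in-flight I/O count:
 *
 *     static int quiesce_cb(vfu_ctx_t *vfu_ctx) {
 *         struct my_dev *dev = vfu_get_private(vfu_ctx);
 *         if (dev->inflight > 0) {
 *             dev->quiescing = true;   // finish quiescing asynchronously
 *             errno = EBUSY;
 *             return -1;
 *         }
 *         return 0;
 *     }
 *
 *     vfu_setup_device_quiesce_cb(vfu_ctx, quiesce_cb);
 *
 *     // Later, once the last in-flight I/O completes:
 *     vfu_device_quiesced(vfu_ctx, 0);
 */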

/*
 * Callback function that is called when the device must be reset.
 */
typedef int (vfu_reset_cb_t)(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type);

/**
 * Set up device reset callback.
 *
 * A reset should ensure that all on-going use of device IRQs or guest memory is
 * completed or cancelled before returning from the callback.
 *
 * @vfu_ctx: the libvfio-user context
 * @reset: device reset callback
 */
int
vfu_setup_device_reset_cb(vfu_ctx_t *vfu_ctx, vfu_reset_cb_t *reset);

/*
 * Info for a guest DMA region.  @iova is always valid; the other parameters
 * will only be set if the guest DMA region is mappable.
 *
 * @iova: guest DMA range. This is the guest physical range (as we don't
 *   support vIOMMU) that the guest registers for DMA, via a VFIO_USER_DMA_MAP
 *   message, and is the address space used as input to vfu_addr_to_sgl().
 * @vaddr: if the range is mapped into this process, this is the virtual address
 *   of the start of the region.
 * @mapping: if @vaddr is non-NULL, this range represents the actual range
 *   mmap()ed into the process. This might be (large) page aligned, and
 *   therefore be different from @vaddr + @iova.iov_len.
 * @page_size: if @vaddr is non-NULL, page size of the mapping (e.g. 2MB)
 * @prot: if @vaddr is non-NULL, protection settings of the mapping as per
 *   mmap(2)
 *
 * For a real example, using the gpio sample server, and a qemu configured to
 * use huge pages and share its memory:
 *
 * gpio: mapped DMA region iova=[0xf0000-0x10000000) vaddr=0x2aaaab0f0000
 * page_size=0x200000 mapping=[0x2aaaab000000-0x2aaabb000000)
 *
 *     0xf0000                    0x10000000
 *     |                                   |
 *     v                                   v
 *     +-----------------------------------+
 *     | Guest IOVA (DMA) space            |
 *  +--+-----------------------------------+--+
 *  |  |                                   |  |
 *  |  +-----------------------------------+  |
 *  |  ^ libvfio-user server address space    |
 *  +--|--------------------------------------+
 *  ^ vaddr=0x2aaaab0f0000                    ^
 *  |                                         |
 *  0x2aaaab000000               0x2aaabb000000
 *
 * This region can be directly accessed at 0x2aaaab0f0000, but the underlying
 * large page mapping is in the range [0x2aaaab000000-0x2aaabb000000).
 */
typedef struct vfu_dma_info {
    struct iovec iova;
    void *vaddr;
    struct iovec mapping;
    size_t page_size;
    uint32_t prot;
} vfu_dma_info_t;

/*
 * Called when a guest registers one of its DMA regions via a VFIO_USER_DMA_MAP
 * message.
 *
 * @vfu_ctx: the libvfio-user context
 * @info: the DMA info
 */
typedef void (vfu_dma_register_cb_t)(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info);

/*
 * Function that is called when the guest unregisters a DMA region.  This
 * callback is required if you want to be able to access guest memory directly
 * via a mapping. The device must release all references to that region before
 * the callback returns.
 *
 * @vfu_ctx: the libvfio-user context
 * @info: the DMA info
 */
typedef void (vfu_dma_unregister_cb_t)(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info);

/**
 * Set up device DMA registration callbacks. When libvfio-user is notified of a
 * DMA range addition or removal, these callbacks will be invoked.
 *
 * If this function is not called, guest DMA regions are not accessible via
 * vfu_addr_to_sgl().
 *
 * To directly access this DMA memory via a local mapping with vfu_sgl_get(), at
 * least @dma_unregister must be provided.
 *
 * @vfu_ctx: the libvfio-user context
 * @dma_register: DMA region registration callback (optional)
 * @dma_unregister: DMA region unregistration callback (optional)
 */
int
vfu_setup_device_dma(vfu_ctx_t *vfu_ctx, vfu_dma_register_cb_t *dma_register,
                     vfu_dma_unregister_cb_t *dma_unregister);
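
/*
 * A minimal sketch of DMA registration callbacks; the private-data usage is
 * illustrative only:
 *
 *     static void dma_register_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) {
 *         if (info->vaddr != NULL) {
 *             // Region is mappable; we may cache info->vaddr for direct
 *             // access to [info->iova.iov_base, +info->iova.iov_len).
 *         }
 *     }
 *
 *     static void dma_unregister_cb(vfu_ctx_t *vfu_ctx,
 *                                   vfu_dma_info_t *info) {
 *         // Must drop all references to this region before returning,
 *         // e.g. by calling vfu_sgl_put() on any outstanding mappings.
 *     }
 *
 *     vfu_setup_device_dma(vfu_ctx, dma_register_cb, dma_unregister_cb);
 */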

enum vfu_dev_irq_type {
    VFU_DEV_INTX_IRQ,
    VFU_DEV_MSI_IRQ,
    VFU_DEV_MSIX_IRQ,
    VFU_DEV_ERR_IRQ,
    VFU_DEV_REQ_IRQ,
    VFU_DEV_NUM_IRQS
};

/**
 * Set up device IRQ counts.
 * @vfu_ctx: the libvfio-user context
 * @type: IRQ type (VFU_DEV_INTX_IRQ ... VFU_DEV_REQ_IRQ)
 * @count: number of IRQs
 */
int
vfu_setup_device_nr_irqs(vfu_ctx_t *vfu_ctx, enum vfu_dev_irq_type type,
                         uint32_t count);

/*
 * Function that is called when the guest masks or unmasks an IRQ vector.
 *
 * @vfu_ctx: the libvfio-user context
 * @start: starting IRQ vector
 * @count: number of vectors
 * @mask: indicates if the IRQ is masked or unmasked
 */
typedef void (vfu_dev_irq_state_cb_t)(vfu_ctx_t *vfu_ctx, uint32_t start,
                                      uint32_t count, bool mask);

/**
 * Set up IRQ state change callback. When libvfio-user is notified of a
 * change to IRQ state, whether masked or unmasked, it invokes
 * this callback.
 *
 * @vfu_ctx: the libvfio-user context
 * @type: IRQ type (enum vfu_dev_irq_type), e.g. VFU_DEV_MSIX_IRQ
 * @cb: IRQ state change callback
 *
 * @returns 0 on success, -1 on error, sets errno.
 */
int
vfu_setup_irq_state_callback(vfu_ctx_t *vfu_ctx, enum vfu_dev_irq_type type,
                             vfu_dev_irq_state_cb_t *cb);

typedef enum {
    VFU_MIGR_STATE_STOP,
    VFU_MIGR_STATE_RUNNING,
    VFU_MIGR_STATE_STOP_AND_COPY,
    VFU_MIGR_STATE_PRE_COPY,
    VFU_MIGR_STATE_RESUME
} vfu_migr_state_t;

#define VFU_MIGR_CALLBACKS_VERS 2

typedef struct {

    /*
     * Set it to VFU_MIGR_CALLBACKS_VERS.
     */
    int version;

    /*
     * Migration state transition callback.
     *
     * The callback should return -1 on error, setting errno.
     *
     * TODO rename to vfu_migration_state_transition_callback
     * FIXME maybe we should create a single callback and pass the state?
     */
    int (*transition)(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state);

    /*
     * Function that is called to read `count` bytes of migration data into
     * `buf`. The function must return the amount of data read or -1 on error,
     * setting errno. The function may return less data than requested.
     *
     * If the function returns zero, this is interpreted to mean that there is
     * no more migration data to read.
     */
    ssize_t (*read_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count);

    /*
     * Function that is called for writing previously stored device state. The
     * function must return the amount of data written or -1 on error, setting
     * errno. Partial writes are not supported, so any return value other than
     * `count` is invalid.
     */
    ssize_t (*write_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count);

} vfu_migration_callbacks_t;

int
vfu_setup_device_migration_callbacks(vfu_ctx_t *vfu_ctx,
    const vfu_migration_callbacks_t *callbacks);
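
/*
 * A minimal sketch of registering migration callbacks; the flat state buffer
 * and the transition/write_data callbacks are illustrative only:
 *
 *     static ssize_t read_data_cb(vfu_ctx_t *vfu_ctx, void *buf,
 *                                 uint64_t count) {
 *         struct my_dev *dev = vfu_get_private(vfu_ctx);
 *         size_t left = dev->state_size - dev->state_off;
 *         size_t n = count < left ? count : left;    // 0 => no more data
 *         memcpy(buf, dev->state + dev->state_off, n);
 *         dev->state_off += n;
 *         return n;
 *     }
 *
 *     const vfu_migration_callbacks_t migr_cbs = {
 *         .version    = VFU_MIGR_CALLBACKS_VERS,
 *         .transition = transition_cb,
 *         .read_data  = read_data_cb,
 *         .write_data = write_data_cb,
 *     };
 *
 *     vfu_setup_device_migration_callbacks(vfu_ctx, &migr_cbs);
 */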

/**
 * Triggers an interrupt.
 *
 * libvfio-user takes care of using the correct IRQ type (IRQ index: INTx or
 * MSI/X), the caller only needs to specify the sub-index.
 *
 * @vfu_ctx: the libvfio-user context to trigger interrupt
 * @subindex: vector subindex to trigger interrupt on
 *
 * @returns 0 on success, or -1 on failure. Sets errno.
 */
int
vfu_irq_trigger(vfu_ctx_t *vfu_ctx, uint32_t subindex);
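
/*
 * A minimal sketch: advertise 8 MSI-X vectors at setup time, then later
 * signal vector 3:
 *
 *     vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, 8);
 *     ...
 *     vfu_irq_trigger(vfu_ctx, 3);
 */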

/**
 * Takes a guest physical address range and populates an array of scatter/gather
 * entries that can be individually mapped in the program's virtual memory.  A
 * single linear guest physical address span may need to be split into multiple
 * scatter/gather regions due to limitations of how memory can be mapped.
 *
 * vfu_setup_device_dma() must have been called prior to using this function.
 *
 * @vfu_ctx: the libvfio-user context
 * @dma_addr: the guest physical address
 * @len: size of memory to be mapped
 * @sgl: array that receives the scatter/gather entries to be mapped
 * @max_nr_sgs: maximum number of elements in above array
 * @prot: protection as defined in <sys/mman.h>
 *
 * @returns the number of scatter/gather entries created on success, and on
 * failure:
 *  -1:         if the GPA span is invalid (errno=ENOENT) or there is a
 *              protection violation (errno=EACCES)
 *  (-x - 1):   if @max_nr_sgs is too small, where x is the number of SG
 *              entries necessary to complete this request (errno=0).
 */
int
vfu_addr_to_sgl(vfu_ctx_t *vfu_ctx, vfu_dma_addr_t dma_addr, size_t len,
                dma_sg_t *sgl, size_t max_nr_sgs, int prot);

/**
 * Populate the given iovec array (accessible in the process's virtual memory),
 * based upon the SGL previously built via vfu_addr_to_sgl().
 * It is the caller's responsibility to release the iovecs via vfu_sgl_put().
 *
 * This is only supported when a @dma_unregister callback is provided to
 * vfu_setup_device_dma().
 *
 * @vfu_ctx: the libvfio-user context
 * @sgl: array of scatter/gather entries returned by vfu_addr_to_sgl(). These
 *       entries must not be modified and the array must not be deallocated
 *       until vfu_sgl_put() has been called.
 * @iov: array of iovec structures (defined in <sys/uio.h>) to receive each
 *       mapping
 * @cnt: number of scatter/gather entries to map
 * @flags: must be 0
 *
 * @returns 0 on success, -1 on failure. Sets errno.
 */
int
vfu_sgl_get(vfu_ctx_t *vfu_ctx, dma_sg_t *sgl, struct iovec *iov, size_t cnt,
            int flags);

/**
 * Mark scatter/gather entries (previously acquired via vfu_sgl_get())
 * as dirty (written to). This is only necessary if vfu_sgl_put() is not called.
 *
 * @vfu_ctx: the libvfio-user context
 * @sgl: array of scatter/gather entries to mark as dirty
 * @cnt: number of scatter/gather entries to mark as dirty
 */
void
vfu_sgl_mark_dirty(vfu_ctx_t *vfu_ctx, dma_sg_t *sgl, size_t cnt);

/**
 * Release the iovec array previously acquired by vfu_sgl_get().
 *
 * This will automatically mark the sgl as dirty if needed.
 *
 * @vfu_ctx: the libvfio-user context
 * @sgl: array of scatter/gather entries to unmap
 * @iov: array of iovec structures for each scatter/gather entry
 * @cnt: number of scatter/gather entries to unmap
 */
void
vfu_sgl_put(vfu_ctx_t *vfu_ctx, dma_sg_t *sgl, struct iovec *iov, size_t cnt);
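
/*
 * A minimal sketch of the mapping workflow, assuming a single-entry SGL
 * suffices for the range (see vfu_addr_to_sgl() for the multi-entry case):
 *
 *     dma_sg_t *sgl = malloc(dma_sg_size());
 *     struct iovec iov;
 *
 *     if (vfu_addr_to_sgl(vfu_ctx, dma_addr, len, sgl, 1,
 *                         PROT_READ | PROT_WRITE) != 1) {
 *         // handle error, or a larger SGL being required
 *     }
 *     if (vfu_sgl_get(vfu_ctx, sgl, &iov, 1, 0) < 0) {
 *         // not mappable: fall back to vfu_sgl_read()/vfu_sgl_write()
 *     }
 *     memset(iov.iov_base, 0, iov.iov_len);   // direct access
 *     vfu_sgl_put(vfu_ctx, sgl, &iov, 1);     // marks dirty as needed
 */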

/**
 * Read from the DMA region exposed by the client. This can be used as an
 * alternative to reading from a vfu_sgl_get() mapping, if the region is not
 * directly mappable, or DMA notification callbacks have not been provided.
 *
 * @vfu_ctx: the libvfio-user context
 * @sg: array of scatter/gather entries obtained from vfu_addr_to_sgl()
 * @cnt: number of scatter/gather entries to read
 * @data: data buffer to read into
 *
 * @returns 0 on success, -1 on failure. Sets errno.
 */
int
vfu_sgl_read(vfu_ctx_t *vfu_ctx, dma_sg_t *sg, size_t cnt, void *data);

/**
 * Write to the DMA region exposed by the client. This can be used as an
 * alternative to writing to a vfu_sgl_get() mapping, if the region is not
 * directly mappable, or DMA notification callbacks have not been provided.
 *
 * During live migration, this call does not mark any of the written pages as
 * dirty; the client is expected to track this.
 *
 * @vfu_ctx: the libvfio-user context
 * @sg: array of scatter/gather entries obtained from vfu_addr_to_sgl()
 * @cnt: number of scatter/gather entries to write
 * @data: data buffer to write
 *
 * @returns 0 on success, -1 on failure. Sets errno.
 */
int
vfu_sgl_write(vfu_ctx_t *vfu_ctx, dma_sg_t *sg, size_t cnt, void *data);

/*
 * Supported PCI regions.
 *
 * Note: in VFIO, each region starts at a terabyte offset
 * (VFIO_PCI_INDEX_TO_OFFSET) and because Linux supports up to 128 TB of user
 * space virtual memory, there can be up to 128 device regions. PCI regions are
 * fixed and in retrospect this choice has proven to be problematic because
 * devices might contain potentially unused regions. New regions can now be
 * positioned anywhere by using the VFIO_REGION_INFO_CAP_TYPE capability.  In
 * vfio-user we don't have this problem because the region index is just an
 * identifier: the VMM memory maps a file descriptor that is passed to it and
 * the mapping offset is derived from the mmap_areas offset value, rather than a
 * static mapping from region index to offset. Thus, additional regions can
 * have static indexes in vfio-user.
 */
enum {
    VFU_PCI_DEV_BAR0_REGION_IDX,
    VFU_PCI_DEV_BAR1_REGION_IDX,
    VFU_PCI_DEV_BAR2_REGION_IDX,
    VFU_PCI_DEV_BAR3_REGION_IDX,
    VFU_PCI_DEV_BAR4_REGION_IDX,
    VFU_PCI_DEV_BAR5_REGION_IDX,
    VFU_PCI_DEV_ROM_REGION_IDX,
    VFU_PCI_DEV_CFG_REGION_IDX,
    VFU_PCI_DEV_VGA_REGION_IDX,
    VFU_PCI_DEV_MIGR_REGION_IDX,
    VFU_PCI_DEV_NUM_REGIONS,
};

typedef enum {
    VFU_PCI_TYPE_CONVENTIONAL,
    VFU_PCI_TYPE_PCI_X_1,
    VFU_PCI_TYPE_PCI_X_2,
    VFU_PCI_TYPE_EXPRESS
} vfu_pci_type_t;

enum {
    VFU_GENERIC_DEV_MIGR_REGION_IDX,
    VFU_GENERIC_DEV_NUM_REGIONS
};

/**
 * Initialize the context for a PCI device. This function must be called only
 * once per libvfio-user context.
 *
 * This function initializes a buffer for the PCI config space, accessible via
 * vfu_pci_get_config_space().
 *
 * Returns 0 on success, or -1 on error, setting errno.
 *
 * @vfu_ctx: the libvfio-user context
 * @pci_type: PCI type (conventional PCI, PCI-X mode 1, PCI-X mode 2,
 *   PCI Express)
 * @hdr_type: PCI header type. Only PCI_HEADER_TYPE_NORMAL is supported.
 * @revision: PCI/PCI-X/PCIe revision
 */
int
vfu_pci_init(vfu_ctx_t *vfu_ctx, vfu_pci_type_t pci_type,
             int hdr_type, int revision);

/*
 * Set the Vendor ID, Device ID, Subsystem Vendor ID, and Subsystem ID fields of
 * the PCI config header (PCI3 6.2.1, 6.2.4).
 *
 * This must always be called for PCI devices, after vfu_pci_init().
 */
void
vfu_pci_set_id(vfu_ctx_t *vfu_ctx, uint16_t vid, uint16_t did,
               uint16_t ssvid, uint16_t ssid);

/*
 * Set the class code fields (base, sub-class, and programming interface) of the
 * PCI config header (PCI3 6.2.1).
 *
 * If this function is not called, the fields are initialized to zero.
 */
void
vfu_pci_set_class(vfu_ctx_t *vfu_ctx, uint8_t base, uint8_t sub, uint8_t pi);
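
/*
 * A minimal sketch of bringing up a PCI device context (the IDs are
 * illustrative only):
 *
 *     vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
 *     vfu_pci_set_id(vfu_ctx, 0x1de5, 0x1234, 0x1de5, 0x5678);
 *     vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);   // NVMe-style class code
 */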


/*
 * Returns a pointer to the PCI configuration space.
 *
 * PCI config space consists of an initial 64-byte vfu_pci_hdr_t, plus
 * additional space, containing capabilities and/or device-specific
 * configuration.  Standard config space is 256 bytes (PCI_CFG_SPACE_SIZE);
 * extended config space is 4096 bytes (PCI_CFG_SPACE_EXP_SIZE).
 */
vfu_pci_config_space_t *
vfu_pci_get_config_space(vfu_ctx_t *vfu_ctx);

#define VFU_CAP_FLAG_EXTENDED (1 << 0)
#define VFU_CAP_FLAG_CALLBACK (1 << 1)
#define VFU_CAP_FLAG_READONLY (1 << 2)

/**
 * Add a PCI capability to PCI config space.
 *
 * Certain standard capabilities are handled entirely within the library:
 *
 * PCI_CAP_ID_EXP (pxcap)
 * PCI_CAP_ID_MSI (msicap)
 * PCI_CAP_ID_MSIX (msixcap)
 * PCI_CAP_ID_PM (pmcap)
 *
 * However, they must still be explicitly initialized and added here.
 *
 * The contents of @data are copied in. It must start with either a struct
 * cap_hdr or a struct ext_cap_hdr, with the ID field set; the 'next' field is
 * ignored.  For PCI_CAP_ID_VNDR or PCI_EXT_CAP_ID_VNDR, the embedded size field
 * must also be set; in general, any non-fixed-size capability must be
 * initialized such that the size can be derived at this point.
 *
 * If @pos is non-zero, the capability will be placed at the given offset within
 * configuration space. It must not overlap the PCI standard header, or any
 * existing capability. Note that if a capability is added "out of order" in
 * terms of the offset, there is no re-ordering of the capability list written
 * in configuration space.
 *
 * If @pos is zero, the capability will be placed at a suitable offset
 * automatically.
 *
 * The @flags field can be set as follows:
 *
 * VFU_CAP_FLAG_EXTENDED: this is an extended capability; supported if device is
 * of PCI type VFU_PCI_TYPE_{PCI_X_2,EXPRESS}.
 *
 * VFU_CAP_FLAG_CALLBACK: all accesses to the capability are delegated to the
 * callback for the region VFU_PCI_DEV_CFG_REGION_IDX. The callback should copy
 * data into and out of the capability as needed (this could be directly on the
 * config space area from vfu_pci_get_config_space()). It is not supported to
 * allow writes to the initial capability header (ID/next fields).
 *
 * VFU_CAP_FLAG_READONLY: this prevents clients from writing to the capability.
 * By default, clients are allowed to write to any part of the capability,
 * excluding the initial header.
 *
 * Returns the offset of the capability in config space, or -1 on error, with
 * errno set.
 *
 * @vfu_ctx: the libvfio-user context
 * @pos: specific offset for the capability, or 0.
 * @flags: VFU_CAP_FLAG_*
 * @data: capability data, including the header
 */
ssize_t
vfu_pci_add_capability(vfu_ctx_t *vfu_ctx, size_t pos, int flags, void *data);
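
/*
 * A minimal sketch of adding a Power Management capability, assuming the
 * pmcap layout from pci_caps/pm.h:
 *
 *     struct pmcap pm = {
 *         .hdr.id = PCI_CAP_ID_PM,
 *         .pmcs.nsfrst = 0x1,     // no soft reset
 *     };
 *
 *     if (vfu_pci_add_capability(vfu_ctx, 0, 0, &pm) < 0) {
 *         err(EXIT_FAILURE, "vfu_pci_add_capability");
 *     }
 */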

/**
 * Find the offset within config space of a given capability (if there are
 * multiple possible matches, use vfu_pci_find_next_capability()).
 *
 * Returns 0 if no such capability was found, with errno set.
 *
 * @vfu_ctx: the libvfio-user context
 * @extended: whether the capability is an extended one or not
 * @cap_id: capability ID (PCI_CAP_ID_* or PCI_EXT_CAP_ID_*)
 */
size_t
vfu_pci_find_capability(vfu_ctx_t *vfu_ctx, bool extended, int cap_id);

/**
 * Find the offset within config space of the given capability, starting from
 * @pos, which must be the valid offset of an existing capability. This can be
 * used to iterate through multiple capabilities with the same ID.
 *
 * Returns 0 if no more matching capabilities were found, with errno set.
 *
 * @vfu_ctx: the libvfio-user context
 * @extended: whether the capability is an extended one or not
 * @pos: offset within config space to start looking
 * @cap_id: capability ID (PCI_CAP_ID_* or PCI_EXT_CAP_ID_*)
 */
size_t
vfu_pci_find_next_capability(vfu_ctx_t *vfu_ctx, bool extended,
                             size_t pos, int cap_id);
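
/*
 * A minimal sketch of iterating over every MSI-X capability instance (in
 * practice at most one is expected):
 *
 *     size_t pos = vfu_pci_find_capability(vfu_ctx, false, PCI_CAP_ID_MSIX);
 *     while (pos != 0) {
 *         // inspect the capability at `pos` in config space
 *         pos = vfu_pci_find_next_capability(vfu_ctx, false, pos,
 *                                            PCI_CAP_ID_MSIX);
 *     }
 */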

/**
 * Returns whether the given scatter/gather entry is mappable into the
 * process's address space.
 *
 * @vfu_ctx: the libvfio-user context
 * @sg: the scatter/gather entry to check
 */
bool
vfu_sg_is_mappable(vfu_ctx_t *vfu_ctx, dma_sg_t *sg);

/*
 * Sets up an ioeventfd on the given memory region at @gpa_offset, with the
 * given @size, @fd, @flags and @datamatch.
 *
 * Returns 0 on success and -1 on failure with errno set.
 *
 * @vfu_ctx: the libvfio-user context
 * @region_idx: index of the memory region on which to set up the ioeventfd
 * @fd: file descriptor of the ioeventfd
 * @gpa_offset: offset into the memory region
 * @size: size of the ioeventfd
 * @flags: any flags to set up the ioeventfd with
 * @datamatch: sets the datamatch value
 * @shadow_fd: file descriptor that can be mmap()ed; KVM will write there the
 *  otherwise discarded value when the ioeventfd is written to. If set to -1
 *  then a normal ioeventfd is set up instead of a shadow one. The vfio-user
 *  client is free to ignore this, even if it supports shadow ioeventfds.
 *  Requires a kernel with shadow ioeventfd support. Experimental: must be
 *  compiled with SHADOW_IOEVENTFD defined, otherwise must be -1.
 * @shadow_offset: offset in shadow memory where the value is written to.
 */
int
vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd,
                     size_t gpa_offset, uint32_t size, uint32_t flags,
                     uint64_t datamatch, int shadow_fd, size_t shadow_offset);

#ifdef __cplusplus
}
#endif

#endif /* LIB_VFIO_USER_H */

/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */