diff options
Diffstat (limited to 'include')
52 files changed, 3979 insertions, 2744 deletions
diff --git a/include/block/block-common.h b/include/block/block-common.h new file mode 100644 index 0000000..fdb7306 --- /dev/null +++ b/include/block/block-common.h @@ -0,0 +1,419 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef BLOCK_COMMON_H +#define BLOCK_COMMON_H + +#include "block/aio.h" +#include "block/aio-wait.h" +#include "qemu/iov.h" +#include "qemu/coroutine.h" +#include "block/accounting.h" +#include "block/dirty-bitmap.h" +#include "block/blockjob.h" +#include "qemu/hbitmap.h" +#include "qemu/transactions.h" + +/* + * generated_co_wrapper + * + * Function specifier, which does nothing but mark functions to be + * generated by scripts/block-coroutine-wrapper.py + * + * Read more in docs/devel/block-coroutine-wrapper.rst + */ +#define generated_co_wrapper + +/* block.c */ +typedef struct BlockDriver BlockDriver; +typedef struct BdrvChild BdrvChild; +typedef struct BdrvChildClass BdrvChildClass; + +typedef struct BlockDriverInfo { + /* in bytes, 0 if irrelevant */ + int cluster_size; + /* offset at which the VM state can be saved (0 if not possible) */ + int64_t vm_state_offset; + bool is_dirty; + /* + * True if this block driver only supports compressed writes + */ + bool needs_compressed_writes; +} BlockDriverInfo; + +typedef struct BlockFragInfo { + uint64_t allocated_clusters; + uint64_t total_clusters; + uint64_t fragmented_clusters; + uint64_t compressed_clusters; +} BlockFragInfo; + +typedef enum { + BDRV_REQ_COPY_ON_READ = 0x1, + BDRV_REQ_ZERO_WRITE = 0x2, + + /* + * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate + * that the block driver should unmap (discard) blocks if it is guaranteed + * that the result will read back as zeroes. The flag is only passed to the + * driver if the block device is opened with BDRV_O_UNMAP. + */ + BDRV_REQ_MAY_UNMAP = 0x4, + + BDRV_REQ_FUA = 0x10, + BDRV_REQ_WRITE_COMPRESSED = 0x20, + + /* + * Signifies that this write request will not change the visible disk + * content. + */ + BDRV_REQ_WRITE_UNCHANGED = 0x40, + + /* + * Forces request serialisation. Use only with write requests. + */ + BDRV_REQ_SERIALISING = 0x80, + + /* + * Execute the request only if the operation can be offloaded or otherwise + * be executed efficiently, but return an error instead of using a slow + * fallback. + */ + BDRV_REQ_NO_FALLBACK = 0x100, + + /* + * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read + * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR + * filter is involved), in which case it signals that the COR operation + * need not read the data into memory (qiov) but only ensure they are + * copied to the top layer (i.e., that COR operation is done). + */ + BDRV_REQ_PREFETCH = 0x200, + + /* + * If we need to wait for other requests, just fail immediately. Used + * only together with BDRV_REQ_SERIALISING. Used only with requests aligned + * to request_alignment (corresponding assertions are in block/io.c). + */ + BDRV_REQ_NO_WAIT = 0x400, + + /* Mask of valid flags */ + BDRV_REQ_MASK = 0x7ff, +} BdrvRequestFlags; + +#define BDRV_O_NO_SHARE 0x0001 /* don't share permissions */ +#define BDRV_O_RDWR 0x0002 +#define BDRV_O_RESIZE 0x0004 /* request permission for resizing the node */ +#define BDRV_O_SNAPSHOT 0x0008 /* open the file read only and save + writes in a snapshot */ +#define BDRV_O_TEMPORARY 0x0010 /* delete the file after use */ +#define BDRV_O_NOCACHE 0x0020 /* do not use the host page cache */ +#define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the + thread pool */ +#define BDRV_O_NO_BACKING 0x0100 /* don't open the backing file */ +#define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */ +#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */ +#define BDRV_O_INACTIVE 0x0800 /* consistency hint for migration handoff */ +#define BDRV_O_CHECK 0x1000 /* open solely for consistency check */ +#define BDRV_O_ALLOW_RDWR 0x2000 /* allow reopen to change from r/o to r/w */ +#define BDRV_O_UNMAP 0x4000 /* execute guest UNMAP/TRIM operations */ +#define BDRV_O_PROTOCOL 0x8000 /* if no block driver is explicitly given: + select an appropriate protocol driver, + ignoring the format layer */ +#define BDRV_O_NO_IO 0x10000 /* don't initialize for I/O */ +#define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening + read-write fails */ +#define BDRV_O_IO_URING 0x40000 /* use io_uring instead of the thread pool */ + +#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH) + + +/* Option names of options parsed by the block layer */ + +#define BDRV_OPT_CACHE_WB "cache.writeback" +#define BDRV_OPT_CACHE_DIRECT "cache.direct" +#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush" +#define BDRV_OPT_READ_ONLY "read-only" +#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only" +#define BDRV_OPT_DISCARD "discard" +#define BDRV_OPT_FORCE_SHARE "force-share" + + +#define BDRV_SECTOR_BITS 9 +#define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) + +#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \ + INT_MAX >> BDRV_SECTOR_BITS) +#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) + +/* + * We want allow aligning requests and disk length up to any 32bit alignment + * and don't afraid of overflow. + * To achieve it, and in the same time use some pretty number as maximum disk + * size, let's define maximum "length" (a limit for any offset/bytes request and + * for disk size) to be the greatest power of 2 less than INT64_MAX. + */ +#define BDRV_MAX_ALIGNMENT (1L << 30) +#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT)) + +/* + * Allocation status flags for bdrv_block_status() and friends. + * + * Public flags: + * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer + * BDRV_BLOCK_ZERO: offset reads as zero + * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data + * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this + * layer rather than any backing, set by block layer + * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this + * layer, set by block layer + * + * Internal flags: + * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request + * that the block layer recompute the answer from the returned + * BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID. + * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for + * zeroes in file child of current block node inside + * returned region. Only valid together with both + * BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not + * appear with BDRV_BLOCK_ZERO. + * + * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the + * host offset within the returned BDS that is allocated for the + * corresponding raw guest data. However, whether that offset + * actually contains data also depends on BDRV_BLOCK_DATA, as follows: + * + * DATA ZERO OFFSET_VALID + * t t t sectors read as zero, returned file is zero at offset + * t f t sectors read as valid from file at offset + * f t t sectors preallocated, read as zero, returned file not + * necessarily zero at offset + * f f t sectors preallocated but read from backing_hd, + * returned file contains garbage at offset + * t t f sectors preallocated, read as zero, unknown offset + * t f f sectors read from unknown file or offset + * f t f not allocated or unknown offset, read as zero + * f f f not allocated or unknown offset, read from backing_hd + */ +#define BDRV_BLOCK_DATA 0x01 +#define BDRV_BLOCK_ZERO 0x02 +#define BDRV_BLOCK_OFFSET_VALID 0x04 +#define BDRV_BLOCK_RAW 0x08 +#define BDRV_BLOCK_ALLOCATED 0x10 +#define BDRV_BLOCK_EOF 0x20 +#define BDRV_BLOCK_RECURSE 0x40 + +typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue; + +typedef struct BDRVReopenState { + BlockDriverState *bs; + int flags; + BlockdevDetectZeroesOptions detect_zeroes; + bool backing_missing; + BlockDriverState *old_backing_bs; /* keep pointer for permissions update */ + BlockDriverState *old_file_bs; /* keep pointer for permissions update */ + QDict *options; + QDict *explicit_options; + void *opaque; +} BDRVReopenState; + +/* + * Block operation types + */ +typedef enum BlockOpType { + BLOCK_OP_TYPE_BACKUP_SOURCE, + BLOCK_OP_TYPE_BACKUP_TARGET, + BLOCK_OP_TYPE_CHANGE, + BLOCK_OP_TYPE_COMMIT_SOURCE, + BLOCK_OP_TYPE_COMMIT_TARGET, + BLOCK_OP_TYPE_DATAPLANE, + BLOCK_OP_TYPE_DRIVE_DEL, + BLOCK_OP_TYPE_EJECT, + BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT, + BLOCK_OP_TYPE_INTERNAL_SNAPSHOT, + BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE, + BLOCK_OP_TYPE_MIRROR_SOURCE, + BLOCK_OP_TYPE_MIRROR_TARGET, + BLOCK_OP_TYPE_RESIZE, + BLOCK_OP_TYPE_STREAM, + BLOCK_OP_TYPE_REPLACE, + BLOCK_OP_TYPE_MAX, +} BlockOpType; + +/* Block node permission constants */ +enum { + /** + * A user that has the "permission" of consistent reads is guaranteed that + * their view of the contents of the block device is complete and + * self-consistent, representing the contents of a disk at a specific + * point. + * + * For most block devices (including their backing files) this is true, but + * the property cannot be maintained in a few situations like for + * intermediate nodes of a commit block job. + */ + BLK_PERM_CONSISTENT_READ = 0x01, + + /** This permission is required to change the visible disk contents. */ + BLK_PERM_WRITE = 0x02, + + /** + * This permission (which is weaker than BLK_PERM_WRITE) is both enough and + * required for writes to the block node when the caller promises that + * the visible disk content doesn't change. + * + * As the BLK_PERM_WRITE permission is strictly stronger, either is + * sufficient to perform an unchanging write. + */ + BLK_PERM_WRITE_UNCHANGED = 0x04, + + /** This permission is required to change the size of a block node. */ + BLK_PERM_RESIZE = 0x08, + + /** + * There was a now-removed bit BLK_PERM_GRAPH_MOD, with value of 0x10. QEMU + * 6.1 and earlier may still lock the corresponding byte in block/file-posix + * locking. So, implementing some new permission should be very careful to + * not interfere with this old unused thing. + */ + + BLK_PERM_ALL = 0x0f, + + DEFAULT_PERM_PASSTHROUGH = BLK_PERM_CONSISTENT_READ + | BLK_PERM_WRITE + | BLK_PERM_WRITE_UNCHANGED + | BLK_PERM_RESIZE, + + DEFAULT_PERM_UNCHANGED = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH, +}; + +/* + * Flags that parent nodes assign to child nodes to specify what kind of + * role(s) they take. + * + * At least one of DATA, METADATA, FILTERED, or COW must be set for + * every child. + */ +enum BdrvChildRoleBits { + /* + * This child stores data. + * Any node may have an arbitrary number of such children. + */ + BDRV_CHILD_DATA = (1 << 0), + + /* + * This child stores metadata. + * Any node may have an arbitrary number of metadata-storing + * children. + */ + BDRV_CHILD_METADATA = (1 << 1), + + /* + * A child that always presents exactly the same visible data as + * the parent, e.g. by virtue of the parent forwarding all reads + * and writes. + * This flag is mutually exclusive with DATA, METADATA, and COW. + * Any node may have at most one filtered child at a time. + */ + BDRV_CHILD_FILTERED = (1 << 2), + + /* + * Child from which to read all data that isn't allocated in the + * parent (i.e., the backing child); such data is copied to the + * parent through COW (and optionally COR). + * This field is mutually exclusive with DATA, METADATA, and + * FILTERED. + * Any node may have at most one such backing child at a time. + */ + BDRV_CHILD_COW = (1 << 3), + + /* + * The primary child. For most drivers, this is the child whose + * filename applies best to the parent node. + * Any node may have at most one primary child at a time. + */ + BDRV_CHILD_PRIMARY = (1 << 4), + + /* Useful combination of flags */ + BDRV_CHILD_IMAGE = BDRV_CHILD_DATA + | BDRV_CHILD_METADATA + | BDRV_CHILD_PRIMARY, +}; + +/* Mask of BdrvChildRoleBits values */ +typedef unsigned int BdrvChildRole; + +typedef struct BdrvCheckResult { + int corruptions; + int leaks; + int check_errors; + int corruptions_fixed; + int leaks_fixed; + int64_t image_end_offset; + BlockFragInfo bfi; +} BdrvCheckResult; + +typedef enum { + BDRV_FIX_LEAKS = 1, + BDRV_FIX_ERRORS = 2, +} BdrvCheckMode; + +typedef struct BlockSizes { + uint32_t phys; + uint32_t log; +} BlockSizes; + +typedef struct HDGeometry { + uint32_t heads; + uint32_t sectors; + uint32_t cylinders; +} HDGeometry; + +/* + * Common functions that are neither I/O nor Global State. + * + * These functions must never call any function from other categories + * (I/O, "I/O or GS", Global State) except this one, but can be invoked by + * all of them. + */ + +char *bdrv_perm_names(uint64_t perm); +uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm); + +void bdrv_init_with_whitelist(void); +bool bdrv_uses_whitelist(void); +int bdrv_is_whitelisted(BlockDriver *drv, bool read_only); + +int bdrv_parse_aio(const char *mode, int *flags); +int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); +int bdrv_parse_discard_flags(const char *mode, int *flags); + +int path_has_protocol(const char *path); +int path_is_absolute(const char *path); +char *path_combine(const char *base_path, const char *filename); + +char *bdrv_get_full_backing_filename_from_filename(const char *backed, + const char *backing, + Error **errp); + +#endif /* BLOCK_COMMON_H */ diff --git a/include/block/block-copy.h b/include/block/block-copy.h index 99370fa..68bbd34 100644 --- a/include/block/block-copy.h +++ b/include/block/block-copy.h @@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState; typedef struct BlockCopyCallState BlockCopyCallState; BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, + const BdrvDirtyBitmap *bitmap, Error **errp); /* Function should be called prior any actual copy request */ @@ -34,6 +35,7 @@ void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm); void block_copy_state_free(BlockCopyState *s); +void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes); int64_t block_copy_reset_unallocated(BlockCopyState *s, int64_t offset, int64_t *count); diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h new file mode 100644 index 0000000..25bb69b --- /dev/null +++ b/include/block/block-global-state.h @@ -0,0 +1,253 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef BLOCK_GLOBAL_STATE_H +#define BLOCK_GLOBAL_STATE_H + +#include "block-common.h" + +/* + * Global state (GS) API. These functions run under the BQL. + * + * If a function modifies the graph, it also uses drain and/or + * aio_context_acquire/release to be sure it has unique access. + * aio_context locking is needed together with BQL because of + * the thread-safe I/O API that concurrently runs and accesses + * the graph without the BQL. + * + * It is important to note that not all of these functions are + * necessarily limited to running under the BQL, but they would + * require additional auditing and many small thread-safety changes + * to move them into the I/O API. Often it's not worth doing that + * work since the APIs are only used with the BQL held at the + * moment, so they have been placed in the GS API (for now). + * + * These functions can call any function from this and other categories + * (I/O, "I/O or GS", Common), but must be invoked only by other GS APIs. + * + * All functions in this header must use the macro + * GLOBAL_STATE_CODE(); + * to catch when they are accidentally called without the BQL. + */ + +void bdrv_init(void); +BlockDriver *bdrv_find_protocol(const char *filename, + bool allow_protocol_prefix, + Error **errp); +BlockDriver *bdrv_find_format(const char *format_name); +int bdrv_create(BlockDriver *drv, const char* filename, + QemuOpts *opts, Error **errp); +int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp); + +BlockDriverState *bdrv_new(void); +int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, + Error **errp); +int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to, + Error **errp); +int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs, + Error **errp); +BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, + int flags, Error **errp); +int bdrv_drop_filter(BlockDriverState *bs, Error **errp); + +BdrvChild *bdrv_open_child(const char *filename, + QDict *options, const char *bdref_key, + BlockDriverState *parent, + const BdrvChildClass *child_class, + BdrvChildRole child_role, + bool allow_none, Error **errp); +BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp); +int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, + Error **errp); +int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, + const char *bdref_key, Error **errp); +BlockDriverState *bdrv_open(const char *filename, const char *reference, + QDict *options, int flags, Error **errp); +BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv, + const char *node_name, + QDict *options, int flags, + Error **errp); +BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name, + int flags, Error **errp); +BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, + BlockDriverState *bs, QDict *options, + bool keep_old_opts); +void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue); +int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp); +int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts, + Error **errp); +int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, + Error **errp); +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file); +void bdrv_refresh_filename(BlockDriverState *bs); +void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp); +int bdrv_commit(BlockDriverState *bs); +int bdrv_make_empty(BdrvChild *c, Error **errp); +int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file, + const char *backing_fmt, bool warn); +void bdrv_register(BlockDriver *bdrv); +int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, + const char *backing_file_str); +BlockDriverState *bdrv_find_overlay(BlockDriverState *active, + BlockDriverState *bs); +BlockDriverState *bdrv_find_base(BlockDriverState *bs); +bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base, + Error **errp); +int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base, + Error **errp); +void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base); + +/* + * The units of offset and total_work_size may be chosen arbitrarily by the + * block driver; total_work_size may change during the course of the amendment + * operation + */ +typedef void BlockDriverAmendStatusCB(BlockDriverState *bs, int64_t offset, + int64_t total_work_size, void *opaque); +int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts, + BlockDriverAmendStatusCB *status_cb, void *cb_opaque, + bool force, + Error **errp); + +/* check if a named node can be replaced when doing drive-mirror */ +BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, + const char *node_name, Error **errp); + +int bdrv_activate(BlockDriverState *bs, Error **errp); +void bdrv_activate_all(Error **errp); +int bdrv_inactivate_all(void); + +int bdrv_flush_all(void); +void bdrv_close_all(void); +void bdrv_drain_all_begin(void); +void bdrv_drain_all_end(void); +void bdrv_drain_all(void); + +int bdrv_has_zero_init_1(BlockDriverState *bs); +int bdrv_has_zero_init(BlockDriverState *bs); +BlockDriverState *bdrv_find_node(const char *node_name); +BlockDeviceInfoList *bdrv_named_nodes_list(bool flat, Error **errp); +XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp); +BlockDriverState *bdrv_lookup_bs(const char *device, + const char *node_name, + Error **errp); +bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base); +BlockDriverState *bdrv_next_node(BlockDriverState *bs); +BlockDriverState *bdrv_next_all_states(BlockDriverState *bs); + +typedef struct BdrvNextIterator { + enum { + BDRV_NEXT_BACKEND_ROOTS, + BDRV_NEXT_MONITOR_OWNED, + } phase; + BlockBackend *blk; + BlockDriverState *bs; +} BdrvNextIterator; + +BlockDriverState *bdrv_first(BdrvNextIterator *it); +BlockDriverState *bdrv_next(BdrvNextIterator *it); +void bdrv_next_cleanup(BdrvNextIterator *it); + +BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs); +void bdrv_iterate_format(void (*it)(void *opaque, const char *name), + void *opaque, bool read_only); +int bdrv_get_flags(BlockDriverState *bs); +char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp); +char *bdrv_dirname(BlockDriverState *bs, Error **errp); + +void bdrv_img_create(const char *filename, const char *fmt, + const char *base_filename, const char *base_fmt, + char *options, uint64_t img_size, int flags, + bool quiet, Error **errp); + +void bdrv_ref(BlockDriverState *bs); +void bdrv_unref(BlockDriverState *bs); +void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child); +BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, + BlockDriverState *child_bs, + const char *child_name, + const BdrvChildClass *child_class, + BdrvChildRole child_role, + Error **errp); + +bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp); +void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason); +void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason); +void bdrv_op_block_all(BlockDriverState *bs, Error *reason); +void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason); +bool bdrv_op_blocker_is_empty(BlockDriverState *bs); + +int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, + const char *tag); +int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag); +int bdrv_debug_resume(BlockDriverState *bs, const char *tag); +bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag); + +/** + * Locks the AioContext of @bs if it's not the current AioContext. This avoids + * double locking which could lead to deadlocks: This is a coroutine_fn, so we + * know we already own the lock of the current AioContext. + * + * May only be called in the main thread. + */ +void coroutine_fn bdrv_co_lock(BlockDriverState *bs); + +/** + * Unlocks the AioContext of @bs if it's not the current AioContext. + */ +void coroutine_fn bdrv_co_unlock(BlockDriverState *bs); + +void bdrv_set_aio_context_ignore(BlockDriverState *bs, + AioContext *new_context, GSList **ignore); +int bdrv_try_set_aio_context(BlockDriverState *bs, AioContext *ctx, + Error **errp); +int bdrv_child_try_set_aio_context(BlockDriverState *bs, AioContext *ctx, + BdrvChild *ignore_child, Error **errp); +bool bdrv_child_can_set_aio_context(BdrvChild *c, AioContext *ctx, + GSList **ignore, Error **errp); +bool bdrv_can_set_aio_context(BlockDriverState *bs, AioContext *ctx, + GSList **ignore, Error **errp); +AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c); + +int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz); +int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo); + +void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child, + Error **errp); +void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp); + +/** + * + * bdrv_register_buf/bdrv_unregister_buf: + * + * Register/unregister a buffer for I/O. For example, VFIO drivers are + * interested to know the memory areas that would later be used for I/O, so + * that they can prepare IOMMU mapping etc., to get better performance. + */ +void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size); +void bdrv_unregister_buf(BlockDriverState *bs, void *host); + +void bdrv_cancel_in_flight(BlockDriverState *bs); + +#endif /* BLOCK_GLOBAL_STATE_H */ diff --git a/include/block/block-io.h b/include/block/block-io.h new file mode 100644 index 0000000..5e3f346 --- /dev/null +++ b/include/block/block-io.h @@ -0,0 +1,368 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef BLOCK_IO_H +#define BLOCK_IO_H + +#include "block-common.h" + +/* + * I/O API functions. These functions are thread-safe, and therefore + * can run in any thread as long as the thread has called + * aio_context_acquire/release(). + * + * These functions can only call functions from I/O and Common categories, + * but can be invoked by GS, "I/O or GS" and I/O APIs. + * + * All functions in this category must use the macro + * IO_CODE(); + * to catch when they are accidentally called by the wrong API. + */ + +int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, + int64_t bytes, BdrvRequestFlags flags); +int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags); +int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes); +int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, + int64_t bytes); +int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, + const void *buf, int64_t bytes); +/* + * Efficiently zero a region of the disk image. Note that this is a regular + * I/O request like read or write and should have a reasonable size. This + * function is not suitable for zeroing the entire image in a single request + * because it may allocate memory for the entire region. + */ +int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, + int64_t bytes, BdrvRequestFlags flags); + +int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + PreallocMode prealloc, BdrvRequestFlags flags, + Error **errp); + +int64_t bdrv_nb_sectors(BlockDriverState *bs); +int64_t bdrv_getlength(BlockDriverState *bs); +int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); +BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts, + BlockDriverState *in_bs, Error **errp); +void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr); +int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp); +void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs); + + +/* async block I/O */ +void bdrv_aio_cancel(BlockAIOCB *acb); +void bdrv_aio_cancel_async(BlockAIOCB *acb); + +/* sg packet commands */ +int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); + +/* Ensure contents are flushed to disk. */ +int coroutine_fn bdrv_co_flush(BlockDriverState *bs); + +int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); +bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); +int bdrv_block_status(BlockDriverState *bs, int64_t offset, + int64_t bytes, int64_t *pnum, int64_t *map, + BlockDriverState **file); +int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, + int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file); +int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, + int64_t *pnum); +int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, + bool include_base, int64_t offset, int64_t bytes, + int64_t *pnum); +int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, + int64_t bytes); + +int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only, + bool ignore_allow_rdw, Error **errp); +int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg, + Error **errp); +bool bdrv_is_read_only(BlockDriverState *bs); +bool bdrv_is_writable(BlockDriverState *bs); +bool bdrv_is_sg(BlockDriverState *bs); +bool bdrv_is_inserted(BlockDriverState *bs); +void bdrv_lock_medium(BlockDriverState *bs, bool locked); +void bdrv_eject(BlockDriverState *bs, bool eject_flag); +const char *bdrv_get_format_name(BlockDriverState *bs); + +bool bdrv_supports_compressed_writes(BlockDriverState *bs); +const char *bdrv_get_node_name(const BlockDriverState *bs); +const char *bdrv_get_device_name(const BlockDriverState *bs); +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs); +int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi); +ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs, + Error **errp); +BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs); +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t offset, int64_t bytes, + int64_t *cluster_offset, + int64_t *cluster_bytes); + +void bdrv_get_backing_filename(BlockDriverState *bs, + char *filename, int filename_size); + +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size); + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size); + +/* + * Returns the alignment in bytes that is required so that no bounce buffer + * is required throughout the stack + */ +size_t bdrv_min_mem_align(BlockDriverState *bs); +/* Returns optimal alignment in bytes for bounce buffer */ +size_t bdrv_opt_mem_align(BlockDriverState *bs); +void *qemu_blockalign(BlockDriverState *bs, size_t size); +void *qemu_blockalign0(BlockDriverState *bs, size_t size); +void *qemu_try_blockalign(BlockDriverState *bs, size_t size); +void *qemu_try_blockalign0(BlockDriverState *bs, size_t size); +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); + +void bdrv_enable_copy_on_read(BlockDriverState *bs); +void bdrv_disable_copy_on_read(BlockDriverState *bs); + +void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event); + +#define BLKDBG_EVENT(child, evt) \ + do { \ + if (child) { \ + bdrv_debug_event(child->bs, evt); \ + } \ + } while (0) + +/** + * bdrv_get_aio_context: + * + * Returns: the currently bound #AioContext + */ +AioContext *bdrv_get_aio_context(BlockDriverState *bs); + +/** + * Move the current coroutine to the AioContext of @bs and return the old + * AioContext of the coroutine. Increase bs->in_flight so that draining @bs + * will wait for the operation to proceed until the corresponding + * bdrv_co_leave(). + * + * Consequently, you can't call drain inside a bdrv_co_enter/leave() section as + * this will deadlock. + */ +AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs); + +/** + * Ends a section started by bdrv_co_enter(). Move the current coroutine back + * to old_ctx and decrease bs->in_flight again. + */ +void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx); + +/** + * Transfer control to @co in the aio context of @bs + */ +void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co); + +AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c); + +void bdrv_io_plug(BlockDriverState *bs); +void bdrv_io_unplug(BlockDriverState *bs); + +bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, + uint32_t granularity, Error **errp); + +/** + * + * bdrv_co_copy_range: + * + * Do offloaded copy between two children. If the operation is not implemented + * by the driver, or if the backend storage doesn't support it, a negative + * error code will be returned. + * + * Note: block layer doesn't emulate or fallback to a bounce buffer approach + * because usually the caller shouldn't attempt offloaded copy any more (e.g. + * calling copy_file_range(2)) after the first error, thus it should fall back + * to a read+write path in the caller level. + * + * @src: Source child to copy data from + * @src_offset: offset in @src image to read data + * @dst: Destination child to copy data to + * @dst_offset: offset in @dst image to write data + * @bytes: number of bytes to copy + * @flags: request flags. Supported flags: + * BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero + * write on @dst as if bdrv_co_pwrite_zeroes is + * called. Used to simplify caller code, or + * during BlockDriver.bdrv_co_copy_range_from() + * recursion. + * BDRV_REQ_NO_SERIALISING - do not serialize with other overlapping + * requests currently in flight. + * + * Returns: 0 if succeeded; negative error code if failed. + **/ +int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + +/** + * bdrv_drained_end_no_poll: + * + * Same as bdrv_drained_end(), but do not poll for the subgraph to + * actually become unquiesced. Therefore, no graph changes will occur + * with this function. + * + * *drained_end_counter is incremented for every background operation + * that is scheduled, and will be decremented for every operation once + * it settles. The caller must poll until it reaches 0. The counter + * should be accessed using atomic operations only. + */ +void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter); + + +/* + * "I/O or GS" API functions. These functions can run without + * the BQL, but only in one specific iothread/main loop. + * + * More specifically, these functions use BDRV_POLL_WHILE(bs), which + * requires the caller to be either in the main thread and hold + * the BlockdriverState (bs) AioContext lock, or directly in the + * home thread that runs the bs AioContext. Calling them from + * another thread in another AioContext would cause deadlocks. + * + * Therefore, these functions are not proper I/O, because they + * can't run in *any* iothreads, but only in a specific one. + * + * These functions can call any function from I/O, Common and this + * categories, but must be invoked only by other "I/O or GS" and GS APIs. + * + * All functions in this category must use the macro + * IO_OR_GS_CODE(); + * to catch when they are accidentally called by the wrong API. + */ + +#define BDRV_POLL_WHILE(bs, cond) ({ \ + BlockDriverState *bs_ = (bs); \ + IO_OR_GS_CODE(); \ + AIO_WAIT_WHILE(bdrv_get_aio_context(bs_), \ + cond); }) + +void bdrv_drain(BlockDriverState *bs); +void coroutine_fn bdrv_co_drain(BlockDriverState *bs); + +int generated_co_wrapper +bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, + PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); + +int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix); + +/* Invalidate any cached metadata used by image formats */ +int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs, + Error **errp); +int generated_co_wrapper bdrv_flush(BlockDriverState *bs); +int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset, + int64_t bytes); +int generated_co_wrapper +bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); +int generated_co_wrapper +bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); + +/** + * bdrv_parent_drained_begin_single: + * + * Begin a quiesced section for the parent of @c. If @poll is true, wait for + * any pending activity to cease. + */ +void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll); + +/** + * bdrv_parent_drained_end_single: + * + * End a quiesced section for the parent of @c. + * + * This polls @bs's AioContext until all scheduled sub-drained_ends + * have settled, which may result in graph changes. + */ +void bdrv_parent_drained_end_single(BdrvChild *c); + +/** + * bdrv_drain_poll: + * + * Poll for pending requests in @bs, its parents (except for @ignore_parent), + * and if @recursive is true its children as well (used for subtree drain). + * + * If @ignore_bds_parents is true, parents that are BlockDriverStates must + * ignore the drain request because they will be drained separately (used for + * drain_all). + * + * This is part of bdrv_drained_begin. + */ +bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, + BdrvChild *ignore_parent, bool ignore_bds_parents); + +/** + * bdrv_drained_begin: + * + * Begin a quiesced section for exclusive access to the BDS, by disabling + * external request sources including NBD server, block jobs, and device model. + * + * This function can be recursive. + */ +void bdrv_drained_begin(BlockDriverState *bs); + +/** + * bdrv_do_drained_begin_quiesce: + * + * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already + * running requests to complete. + */ +void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, + BdrvChild *parent, bool ignore_bds_parents); + +/** + * Like bdrv_drained_begin, but recursively begins a quiesced section for + * exclusive access to all child nodes as well. + */ +void bdrv_subtree_drained_begin(BlockDriverState *bs); + +/** + * bdrv_drained_end: + * + * End a quiescent section started by bdrv_drained_begin(). + * + * This polls @bs's AioContext until all scheduled sub-drained_ends + * have settled. On one hand, that may result in graph changes. On + * the other, this requires that the caller either runs in the main + * loop; or that all involved nodes (@bs and all of its parents) are + * in the caller's AioContext. + */ +void bdrv_drained_end(BlockDriverState *bs); + +/** + * End a quiescent section started by bdrv_subtree_drained_begin(). + */ +void bdrv_subtree_drained_end(BlockDriverState *bs); + +#endif /* BLOCK_IO_H */ diff --git a/include/block/block.h b/include/block/block.h index e1713ee..1e6b8fe 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -1,864 +1,32 @@ -#ifndef BLOCK_H -#define BLOCK_H - -#include "block/aio.h" -#include "block/aio-wait.h" -#include "qemu/iov.h" -#include "qemu/coroutine.h" -#include "block/accounting.h" -#include "block/dirty-bitmap.h" -#include "block/blockjob.h" -#include "qemu/hbitmap.h" -#include "qemu/transactions.h" - /* - * generated_co_wrapper - * - * Function specifier, which does nothing but mark functions to be - * generated by scripts/block-coroutine-wrapper.py - * - * Read more in docs/devel/block-coroutine-wrapper.rst - */ -#define generated_co_wrapper - -/* block.c */ -typedef struct BlockDriver BlockDriver; -typedef struct BdrvChild BdrvChild; -typedef struct BdrvChildClass BdrvChildClass; - -typedef struct BlockDriverInfo { - /* in bytes, 0 if irrelevant */ - int cluster_size; - /* offset at which the VM state can be saved (0 if not possible) */ - int64_t vm_state_offset; - bool is_dirty; - /* - * True if this block driver only supports compressed writes - */ - bool needs_compressed_writes; -} BlockDriverInfo; - -typedef struct BlockFragInfo { - uint64_t allocated_clusters; - uint64_t total_clusters; - uint64_t fragmented_clusters; - uint64_t compressed_clusters; -} BlockFragInfo; - -typedef enum { - BDRV_REQ_COPY_ON_READ = 0x1, - BDRV_REQ_ZERO_WRITE = 0x2, - - /* - * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate - * that the block driver should unmap (discard) blocks if it is guaranteed - * that the result will read back as zeroes. The flag is only passed to the - * driver if the block device is opened with BDRV_O_UNMAP. - */ - BDRV_REQ_MAY_UNMAP = 0x4, - - BDRV_REQ_FUA = 0x10, - BDRV_REQ_WRITE_COMPRESSED = 0x20, - - /* Signifies that this write request will not change the visible disk - * content. */ - BDRV_REQ_WRITE_UNCHANGED = 0x40, - - /* Forces request serialisation. Use only with write requests. */ - BDRV_REQ_SERIALISING = 0x80, - - /* Execute the request only if the operation can be offloaded or otherwise - * be executed efficiently, but return an error instead of using a slow - * fallback. */ - BDRV_REQ_NO_FALLBACK = 0x100, - - /* - * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read - * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR - * filter is involved), in which case it signals that the COR operation - * need not read the data into memory (qiov) but only ensure they are - * copied to the top layer (i.e., that COR operation is done). - */ - BDRV_REQ_PREFETCH = 0x200, - - /* - * If we need to wait for other requests, just fail immediately. Used - * only together with BDRV_REQ_SERIALISING. - */ - BDRV_REQ_NO_WAIT = 0x400, - - /* Mask of valid flags */ - BDRV_REQ_MASK = 0x7ff, -} BdrvRequestFlags; - -typedef struct BlockSizes { - uint32_t phys; - uint32_t log; -} BlockSizes; - -typedef struct HDGeometry { - uint32_t heads; - uint32_t sectors; - uint32_t cylinders; -} HDGeometry; - -#define BDRV_O_NO_SHARE 0x0001 /* don't share permissions */ -#define BDRV_O_RDWR 0x0002 -#define BDRV_O_RESIZE 0x0004 /* request permission for resizing the node */ -#define BDRV_O_SNAPSHOT 0x0008 /* open the file read only and save writes in a snapshot */ -#define BDRV_O_TEMPORARY 0x0010 /* delete the file after use */ -#define BDRV_O_NOCACHE 0x0020 /* do not use the host page cache */ -#define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the thread pool */ -#define BDRV_O_NO_BACKING 0x0100 /* don't open the backing file */ -#define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */ -#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */ -#define BDRV_O_INACTIVE 0x0800 /* consistency hint for migration handoff */ -#define BDRV_O_CHECK 0x1000 /* open solely for consistency check */ -#define BDRV_O_ALLOW_RDWR 0x2000 /* allow reopen to change from r/o to r/w */ -#define BDRV_O_UNMAP 0x4000 /* execute guest UNMAP/TRIM operations */ -#define BDRV_O_PROTOCOL 0x8000 /* if no block driver is explicitly given: - select an appropriate protocol driver, - ignoring the format layer */ -#define BDRV_O_NO_IO 0x10000 /* don't initialize for I/O */ -#define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening read-write fails */ -#define BDRV_O_IO_URING 0x40000 /* use io_uring instead of the thread pool */ - -#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH) - - -/* Option names of options parsed by the block layer */ - -#define BDRV_OPT_CACHE_WB "cache.writeback" -#define BDRV_OPT_CACHE_DIRECT "cache.direct" -#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush" -#define BDRV_OPT_READ_ONLY "read-only" -#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only" -#define BDRV_OPT_DISCARD "discard" -#define BDRV_OPT_FORCE_SHARE "force-share" - - -#define BDRV_SECTOR_BITS 9 -#define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) - -#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \ - INT_MAX >> BDRV_SECTOR_BITS) -#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) - -/* - * We want allow aligning requests and disk length up to any 32bit alignment - * and don't afraid of overflow. - * To achieve it, and in the same time use some pretty number as maximum disk - * size, let's define maximum "length" (a limit for any offset/bytes request and - * for disk size) to be the greatest power of 2 less than INT64_MAX. - */ -#define BDRV_MAX_ALIGNMENT (1L << 30) -#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT)) - -/* - * Allocation status flags for bdrv_block_status() and friends. - * - * Public flags: - * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer - * BDRV_BLOCK_ZERO: offset reads as zero - * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data - * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this - * layer rather than any backing, set by block layer - * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this - * layer, set by block layer - * - * Internal flags: - * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request - * that the block layer recompute the answer from the returned - * BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID. - * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for - * zeroes in file child of current block node inside - * returned region. Only valid together with both - * BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not - * appear with BDRV_BLOCK_ZERO. - * - * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the - * host offset within the returned BDS that is allocated for the - * corresponding raw guest data. However, whether that offset - * actually contains data also depends on BDRV_BLOCK_DATA, as follows: - * - * DATA ZERO OFFSET_VALID - * t t t sectors read as zero, returned file is zero at offset - * t f t sectors read as valid from file at offset - * f t t sectors preallocated, read as zero, returned file not - * necessarily zero at offset - * f f t sectors preallocated but read from backing_hd, - * returned file contains garbage at offset - * t t f sectors preallocated, read as zero, unknown offset - * t f f sectors read from unknown file or offset - * f t f not allocated or unknown offset, read as zero - * f f f not allocated or unknown offset, read from backing_hd - */ -#define BDRV_BLOCK_DATA 0x01 -#define BDRV_BLOCK_ZERO 0x02 -#define BDRV_BLOCK_OFFSET_VALID 0x04 -#define BDRV_BLOCK_RAW 0x08 -#define BDRV_BLOCK_ALLOCATED 0x10 -#define BDRV_BLOCK_EOF 0x20 -#define BDRV_BLOCK_RECURSE 0x40 - -typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue; - -typedef struct BDRVReopenState { - BlockDriverState *bs; - int flags; - BlockdevDetectZeroesOptions detect_zeroes; - bool backing_missing; - BlockDriverState *old_backing_bs; /* keep pointer for permissions update */ - BlockDriverState *old_file_bs; /* keep pointer for permissions update */ - QDict *options; - QDict *explicit_options; - void *opaque; -} BDRVReopenState; - -/* - * Block operation types - */ -typedef enum BlockOpType { - BLOCK_OP_TYPE_BACKUP_SOURCE, - BLOCK_OP_TYPE_BACKUP_TARGET, - BLOCK_OP_TYPE_CHANGE, - BLOCK_OP_TYPE_COMMIT_SOURCE, - BLOCK_OP_TYPE_COMMIT_TARGET, - BLOCK_OP_TYPE_DATAPLANE, - BLOCK_OP_TYPE_DRIVE_DEL, - BLOCK_OP_TYPE_EJECT, - BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT, - BLOCK_OP_TYPE_INTERNAL_SNAPSHOT, - BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE, - BLOCK_OP_TYPE_MIRROR_SOURCE, - BLOCK_OP_TYPE_MIRROR_TARGET, - BLOCK_OP_TYPE_RESIZE, - BLOCK_OP_TYPE_STREAM, - BLOCK_OP_TYPE_REPLACE, - BLOCK_OP_TYPE_MAX, -} BlockOpType; - -/* Block node permission constants */ -enum { - /** - * A user that has the "permission" of consistent reads is guaranteed that - * their view of the contents of the block device is complete and - * self-consistent, representing the contents of a disk at a specific - * point. - * - * For most block devices (including their backing files) this is true, but - * the property cannot be maintained in a few situations like for - * intermediate nodes of a commit block job. - */ - BLK_PERM_CONSISTENT_READ = 0x01, - - /** This permission is required to change the visible disk contents. */ - BLK_PERM_WRITE = 0x02, - - /** - * This permission (which is weaker than BLK_PERM_WRITE) is both enough and - * required for writes to the block node when the caller promises that - * the visible disk content doesn't change. - * - * As the BLK_PERM_WRITE permission is strictly stronger, either is - * sufficient to perform an unchanging write. - */ - BLK_PERM_WRITE_UNCHANGED = 0x04, - - /** This permission is required to change the size of a block node. */ - BLK_PERM_RESIZE = 0x08, - - /** - * There was a now-removed bit BLK_PERM_GRAPH_MOD, with value of 0x10. QEMU - * 6.1 and earlier may still lock the corresponding byte in block/file-posix - * locking. So, implementing some new permission should be very careful to - * not interfere with this old unused thing. - */ - - BLK_PERM_ALL = 0x0f, - - DEFAULT_PERM_PASSTHROUGH = BLK_PERM_CONSISTENT_READ - | BLK_PERM_WRITE - | BLK_PERM_WRITE_UNCHANGED - | BLK_PERM_RESIZE, - - DEFAULT_PERM_UNCHANGED = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH, -}; - -/* - * Flags that parent nodes assign to child nodes to specify what kind of - * role(s) they take. - * - * At least one of DATA, METADATA, FILTERED, or COW must be set for - * every child. - */ -enum BdrvChildRoleBits { - /* - * This child stores data. - * Any node may have an arbitrary number of such children. - */ - BDRV_CHILD_DATA = (1 << 0), - - /* - * This child stores metadata. - * Any node may have an arbitrary number of metadata-storing - * children. - */ - BDRV_CHILD_METADATA = (1 << 1), - - /* - * A child that always presents exactly the same visible data as - * the parent, e.g. by virtue of the parent forwarding all reads - * and writes. - * This flag is mutually exclusive with DATA, METADATA, and COW. - * Any node may have at most one filtered child at a time. - */ - BDRV_CHILD_FILTERED = (1 << 2), - - /* - * Child from which to read all data that isn't allocated in the - * parent (i.e., the backing child); such data is copied to the - * parent through COW (and optionally COR). - * This field is mutually exclusive with DATA, METADATA, and - * FILTERED. - * Any node may have at most one such backing child at a time. - */ - BDRV_CHILD_COW = (1 << 3), - - /* - * The primary child. For most drivers, this is the child whose - * filename applies best to the parent node. - * Any node may have at most one primary child at a time. - */ - BDRV_CHILD_PRIMARY = (1 << 4), - - /* Useful combination of flags */ - BDRV_CHILD_IMAGE = BDRV_CHILD_DATA - | BDRV_CHILD_METADATA - | BDRV_CHILD_PRIMARY, -}; - -/* Mask of BdrvChildRoleBits values */ -typedef unsigned int BdrvChildRole; - -char *bdrv_perm_names(uint64_t perm); -uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm); - -void bdrv_init(void); -void bdrv_init_with_whitelist(void); -bool bdrv_uses_whitelist(void); -int bdrv_is_whitelisted(BlockDriver *drv, bool read_only); -BlockDriver *bdrv_find_protocol(const char *filename, - bool allow_protocol_prefix, - Error **errp); -BlockDriver *bdrv_find_format(const char *format_name); -int bdrv_create(BlockDriver *drv, const char* filename, - QemuOpts *opts, Error **errp); -int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp); - -BlockDriverState *bdrv_new(void); -int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, - Error **errp); -int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to, - Error **errp); -int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs, - Error **errp); -BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, - int flags, Error **errp); -int bdrv_drop_filter(BlockDriverState *bs, Error **errp); - -int bdrv_parse_aio(const char *mode, int *flags); -int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); -int bdrv_parse_discard_flags(const char *mode, int *flags); -BdrvChild *bdrv_open_child(const char *filename, - QDict *options, const char *bdref_key, - BlockDriverState* parent, - const BdrvChildClass *child_class, - BdrvChildRole child_role, - bool allow_none, Error **errp); -BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp); -int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, - Error **errp); -int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, - const char *bdref_key, Error **errp); -BlockDriverState *bdrv_open(const char *filename, const char *reference, - QDict *options, int flags, Error **errp); -BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv, - const char *node_name, - QDict *options, int flags, - Error **errp); -BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name, - int flags, Error **errp); -BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, - BlockDriverState *bs, QDict *options, - bool keep_old_opts); -void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue); -int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp); -int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts, - Error **errp); -int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, - Error **errp); -int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, - int64_t bytes, BdrvRequestFlags flags); -int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags); -int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes); -int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, - int64_t bytes); -int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, - const void *buf, int64_t bytes); -/* - * Efficiently zero a region of the disk image. Note that this is a regular - * I/O request like read or write and should have a reasonable size. This - * function is not suitable for zeroing the entire image in a single request - * because it may allocate memory for the entire region. - */ -int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, - int64_t bytes, BdrvRequestFlags flags); -BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, - const char *backing_file); -void bdrv_refresh_filename(BlockDriverState *bs); - -int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, - PreallocMode prealloc, BdrvRequestFlags flags, - Error **errp); -int generated_co_wrapper -bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, - PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); - -int64_t bdrv_nb_sectors(BlockDriverState *bs); -int64_t bdrv_getlength(BlockDriverState *bs); -int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); -BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts, - BlockDriverState *in_bs, Error **errp); -void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr); -void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp); -int bdrv_commit(BlockDriverState *bs); -int bdrv_make_empty(BdrvChild *c, Error **errp); -int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file, - const char *backing_fmt, bool warn); -void bdrv_register(BlockDriver *bdrv); -int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, - const char *backing_file_str); -BlockDriverState *bdrv_find_overlay(BlockDriverState *active, - BlockDriverState *bs); -BlockDriverState *bdrv_find_base(BlockDriverState *bs); -bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base, - Error **errp); -int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base, - Error **errp); -void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base); -int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp); -void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs); - - -typedef struct BdrvCheckResult { - int corruptions; - int leaks; - int check_errors; - int corruptions_fixed; - int leaks_fixed; - int64_t image_end_offset; - BlockFragInfo bfi; -} BdrvCheckResult; - -typedef enum { - BDRV_FIX_LEAKS = 1, - BDRV_FIX_ERRORS = 2, -} BdrvCheckMode; - -int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix); - -/* The units of offset and total_work_size may be chosen arbitrarily by the - * block driver; total_work_size may change during the course of the amendment - * operation */ -typedef void BlockDriverAmendStatusCB(BlockDriverState *bs, int64_t offset, - int64_t total_work_size, void *opaque); -int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts, - BlockDriverAmendStatusCB *status_cb, void *cb_opaque, - bool force, - Error **errp); - -/* check if a named node can be replaced when doing drive-mirror */ -BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, - const char *node_name, Error **errp); - -/* async block I/O */ -void bdrv_aio_cancel(BlockAIOCB *acb); -void bdrv_aio_cancel_async(BlockAIOCB *acb); - -/* sg packet commands */ -int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); - -/* Invalidate any cached metadata used by image formats */ -int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs, - Error **errp); -void bdrv_invalidate_cache_all(Error **errp); -int bdrv_inactivate_all(void); - -/* Ensure contents are flushed to disk. */ -int generated_co_wrapper bdrv_flush(BlockDriverState *bs); -int coroutine_fn bdrv_co_flush(BlockDriverState *bs); -int bdrv_flush_all(void); -void bdrv_close_all(void); -void bdrv_drain(BlockDriverState *bs); -void coroutine_fn bdrv_co_drain(BlockDriverState *bs); -void bdrv_drain_all_begin(void); -void bdrv_drain_all_end(void); -void bdrv_drain_all(void); - -#define BDRV_POLL_WHILE(bs, cond) ({ \ - BlockDriverState *bs_ = (bs); \ - AIO_WAIT_WHILE(bdrv_get_aio_context(bs_), \ - cond); }) - -int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset, - int64_t bytes); -int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); -int bdrv_has_zero_init_1(BlockDriverState *bs); -int bdrv_has_zero_init(BlockDriverState *bs); -bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); -int bdrv_block_status(BlockDriverState *bs, int64_t offset, - int64_t bytes, int64_t *pnum, int64_t *map, - BlockDriverState **file); -int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, - int64_t offset, int64_t bytes, int64_t *pnum, - int64_t *map, BlockDriverState **file); -int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, - int64_t *pnum); -int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, - bool include_base, int64_t offset, int64_t bytes, - int64_t *pnum); -int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, - int64_t bytes); - -bool bdrv_is_read_only(BlockDriverState *bs); -int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only, - bool ignore_allow_rdw, Error **errp); -int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg, - Error **errp); -bool bdrv_is_writable(BlockDriverState *bs); -bool bdrv_is_sg(BlockDriverState *bs); -bool bdrv_is_inserted(BlockDriverState *bs); -void bdrv_lock_medium(BlockDriverState *bs, bool locked); -void bdrv_eject(BlockDriverState *bs, bool eject_flag); -const char *bdrv_get_format_name(BlockDriverState *bs); -BlockDriverState *bdrv_find_node(const char *node_name); -BlockDeviceInfoList *bdrv_named_nodes_list(bool flat, Error **errp); -XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp); -BlockDriverState *bdrv_lookup_bs(const char *device, - const char *node_name, - Error **errp); -bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base); -BlockDriverState *bdrv_next_node(BlockDriverState *bs); -BlockDriverState *bdrv_next_all_states(BlockDriverState *bs); - -typedef struct BdrvNextIterator { - enum { - BDRV_NEXT_BACKEND_ROOTS, - BDRV_NEXT_MONITOR_OWNED, - } phase; - BlockBackend *blk; - BlockDriverState *bs; -} BdrvNextIterator; - -BlockDriverState *bdrv_first(BdrvNextIterator *it); -BlockDriverState *bdrv_next(BdrvNextIterator *it); -void bdrv_next_cleanup(BdrvNextIterator *it); - -BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs); -bool bdrv_supports_compressed_writes(BlockDriverState *bs); -void bdrv_iterate_format(void (*it)(void *opaque, const char *name), - void *opaque, bool read_only); -const char *bdrv_get_node_name(const BlockDriverState *bs); -const char *bdrv_get_device_name(const BlockDriverState *bs); -const char *bdrv_get_device_or_node_name(const BlockDriverState *bs); -int bdrv_get_flags(BlockDriverState *bs); -int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi); -ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs, - Error **errp); -BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs); -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t offset, int64_t bytes, - int64_t *cluster_offset, - int64_t *cluster_bytes); - -void bdrv_get_backing_filename(BlockDriverState *bs, - char *filename, int filename_size); -char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp); -char *bdrv_get_full_backing_filename_from_filename(const char *backed, - const char *backing, - Error **errp); -char *bdrv_dirname(BlockDriverState *bs, Error **errp); - -int path_has_protocol(const char *path); -int path_is_absolute(const char *path); -char *path_combine(const char *base_path, const char *filename); - -int generated_co_wrapper -bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); -int generated_co_wrapper -bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); -int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size); - -int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size); - -void bdrv_img_create(const char *filename, const char *fmt, - const char *base_filename, const char *base_fmt, - char *options, uint64_t img_size, int flags, - bool quiet, Error **errp); - -/* Returns the alignment in bytes that is required so that no bounce buffer - * is required throughout the stack */ -size_t bdrv_min_mem_align(BlockDriverState *bs); -/* Returns optimal alignment in bytes for bounce buffer */ -size_t bdrv_opt_mem_align(BlockDriverState *bs); -void *qemu_blockalign(BlockDriverState *bs, size_t size); -void *qemu_blockalign0(BlockDriverState *bs, size_t size); -void *qemu_try_blockalign(BlockDriverState *bs, size_t size); -void *qemu_try_blockalign0(BlockDriverState *bs, size_t size); -bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); - -void bdrv_enable_copy_on_read(BlockDriverState *bs); -void bdrv_disable_copy_on_read(BlockDriverState *bs); - -void bdrv_ref(BlockDriverState *bs); -void bdrv_unref(BlockDriverState *bs); -void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child); -BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, - BlockDriverState *child_bs, - const char *child_name, - const BdrvChildClass *child_class, - BdrvChildRole child_role, - Error **errp); - -bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp); -void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason); -void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason); -void bdrv_op_block_all(BlockDriverState *bs, Error *reason); -void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason); -bool bdrv_op_blocker_is_empty(BlockDriverState *bs); - -#define BLKDBG_EVENT(child, evt) \ - do { \ - if (child) { \ - bdrv_debug_event(child->bs, evt); \ - } \ - } while (0) - -void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event); - -int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, - const char *tag); -int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag); -int bdrv_debug_resume(BlockDriverState *bs, const char *tag); -bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag); - -/** - * bdrv_get_aio_context: + * QEMU System Emulator block driver * - * Returns: the currently bound #AioContext - */ -AioContext *bdrv_get_aio_context(BlockDriverState *bs); - -/** - * Move the current coroutine to the AioContext of @bs and return the old - * AioContext of the coroutine. Increase bs->in_flight so that draining @bs - * will wait for the operation to proceed until the corresponding - * bdrv_co_leave(). + * Copyright (c) 2003 Fabrice Bellard * - * Consequently, you can't call drain inside a bdrv_co_enter/leave() section as - * this will deadlock. - */ -AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs); - -/** - * Ends a section started by bdrv_co_enter(). Move the current coroutine back - * to old_ctx and decrease bs->in_flight again. - */ -void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx); - -/** - * Locks the AioContext of @bs if it's not the current AioContext. This avoids - * double locking which could lead to deadlocks: This is a coroutine_fn, so we - * know we already own the lock of the current AioContext. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: * - * May only be called in the main thread. - */ -void coroutine_fn bdrv_co_lock(BlockDriverState *bs); - -/** - * Unlocks the AioContext of @bs if it's not the current AioContext. - */ -void coroutine_fn bdrv_co_unlock(BlockDriverState *bs); - -/** - * Transfer control to @co in the aio context of @bs - */ -void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co); - -void bdrv_set_aio_context_ignore(BlockDriverState *bs, - AioContext *new_context, GSList **ignore); -int bdrv_try_set_aio_context(BlockDriverState *bs, AioContext *ctx, - Error **errp); -int bdrv_child_try_set_aio_context(BlockDriverState *bs, AioContext *ctx, - BdrvChild *ignore_child, Error **errp); -bool bdrv_child_can_set_aio_context(BdrvChild *c, AioContext *ctx, - GSList **ignore, Error **errp); -bool bdrv_can_set_aio_context(BlockDriverState *bs, AioContext *ctx, - GSList **ignore, Error **errp); -AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c); -AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c); - -int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz); -int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo); - -void bdrv_io_plug(BlockDriverState *bs); -void bdrv_io_unplug(BlockDriverState *bs); - -/** - * bdrv_parent_drained_begin_single: + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * - * Begin a quiesced section for the parent of @c. If @poll is true, wait for - * any pending activity to cease. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. */ -void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll); - -/** - * bdrv_parent_drained_end_single: - * - * End a quiesced section for the parent of @c. - * - * This polls @bs's AioContext until all scheduled sub-drained_ends - * have settled, which may result in graph changes. - */ -void bdrv_parent_drained_end_single(BdrvChild *c); - -/** - * bdrv_drain_poll: - * - * Poll for pending requests in @bs, its parents (except for @ignore_parent), - * and if @recursive is true its children as well (used for subtree drain). - * - * If @ignore_bds_parents is true, parents that are BlockDriverStates must - * ignore the drain request because they will be drained separately (used for - * drain_all). - * - * This is part of bdrv_drained_begin. - */ -bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, - BdrvChild *ignore_parent, bool ignore_bds_parents); - -/** - * bdrv_drained_begin: - * - * Begin a quiesced section for exclusive access to the BDS, by disabling - * external request sources including NBD server, block jobs, and device model. - * - * This function can be recursive. - */ -void bdrv_drained_begin(BlockDriverState *bs); - -/** - * bdrv_do_drained_begin_quiesce: - * - * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already - * running requests to complete. - */ -void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, - BdrvChild *parent, bool ignore_bds_parents); - -/** - * Like bdrv_drained_begin, but recursively begins a quiesced section for - * exclusive access to all child nodes as well. - */ -void bdrv_subtree_drained_begin(BlockDriverState *bs); - -/** - * bdrv_drained_end: - * - * End a quiescent section started by bdrv_drained_begin(). - * - * This polls @bs's AioContext until all scheduled sub-drained_ends - * have settled. On one hand, that may result in graph changes. On - * the other, this requires that the caller either runs in the main - * loop; or that all involved nodes (@bs and all of its parents) are - * in the caller's AioContext. - */ -void bdrv_drained_end(BlockDriverState *bs); - -/** - * bdrv_drained_end_no_poll: - * - * Same as bdrv_drained_end(), but do not poll for the subgraph to - * actually become unquiesced. Therefore, no graph changes will occur - * with this function. - * - * *drained_end_counter is incremented for every background operation - * that is scheduled, and will be decremented for every operation once - * it settles. The caller must poll until it reaches 0. The counter - * should be accessed using atomic operations only. - */ -void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter); - -/** - * End a quiescent section started by bdrv_subtree_drained_begin(). - */ -void bdrv_subtree_drained_end(BlockDriverState *bs); - -void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child, - Error **errp); -void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp); - -bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, - uint32_t granularity, Error **errp); -/** - * - * bdrv_register_buf/bdrv_unregister_buf: - * - * Register/unregister a buffer for I/O. For example, VFIO drivers are - * interested to know the memory areas that would later be used for I/O, so - * that they can prepare IOMMU mapping etc., to get better performance. - */ -void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size); -void bdrv_unregister_buf(BlockDriverState *bs, void *host); +#ifndef BLOCK_H +#define BLOCK_H -/** - * - * bdrv_co_copy_range: - * - * Do offloaded copy between two children. If the operation is not implemented - * by the driver, or if the backend storage doesn't support it, a negative - * error code will be returned. - * - * Note: block layer doesn't emulate or fallback to a bounce buffer approach - * because usually the caller shouldn't attempt offloaded copy any more (e.g. - * calling copy_file_range(2)) after the first error, thus it should fall back - * to a read+write path in the caller level. - * - * @src: Source child to copy data from - * @src_offset: offset in @src image to read data - * @dst: Destination child to copy data to - * @dst_offset: offset in @dst image to write data - * @bytes: number of bytes to copy - * @flags: request flags. Supported flags: - * BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero - * write on @dst as if bdrv_co_pwrite_zeroes is - * called. Used to simplify caller code, or - * during BlockDriver.bdrv_co_copy_range_from() - * recursion. - * BDRV_REQ_NO_SERIALISING - do not serialize with other overlapping - * requests currently in flight. - * - * Returns: 0 if succeeded; negative error code if failed. - **/ -int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset, - BdrvChild *dst, int64_t dst_offset, - int64_t bytes, BdrvRequestFlags read_flags, - BdrvRequestFlags write_flags); +#include "block-global-state.h" +#include "block-io.h" -void bdrv_cancel_in_flight(BlockDriverState *bs); +/* DO NOT ADD ANYTHING IN HERE. USE ONE OF THE HEADERS INCLUDED ABOVE */ -#endif +#endif /* BLOCK_H */ diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h new file mode 100644 index 0000000..8947aba --- /dev/null +++ b/include/block/block_int-common.h @@ -0,0 +1,1246 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef BLOCK_INT_COMMON_H +#define BLOCK_INT_COMMON_H + +#include "block/accounting.h" +#include "block/block.h" +#include "block/aio-wait.h" +#include "qemu/queue.h" +#include "qemu/coroutine.h" +#include "qemu/stats64.h" +#include "qemu/timer.h" +#include "qemu/hbitmap.h" +#include "block/snapshot.h" +#include "qemu/throttle.h" +#include "qemu/rcu.h" + +#define BLOCK_FLAG_LAZY_REFCOUNTS 8 + +#define BLOCK_OPT_SIZE "size" +#define BLOCK_OPT_ENCRYPT "encryption" +#define BLOCK_OPT_ENCRYPT_FORMAT "encrypt.format" +#define BLOCK_OPT_COMPAT6 "compat6" +#define BLOCK_OPT_HWVERSION "hwversion" +#define BLOCK_OPT_BACKING_FILE "backing_file" +#define BLOCK_OPT_BACKING_FMT "backing_fmt" +#define BLOCK_OPT_CLUSTER_SIZE "cluster_size" +#define BLOCK_OPT_TABLE_SIZE "table_size" +#define BLOCK_OPT_PREALLOC "preallocation" +#define BLOCK_OPT_SUBFMT "subformat" +#define BLOCK_OPT_COMPAT_LEVEL "compat" +#define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts" +#define BLOCK_OPT_ADAPTER_TYPE "adapter_type" +#define BLOCK_OPT_REDUNDANCY "redundancy" +#define BLOCK_OPT_NOCOW "nocow" +#define BLOCK_OPT_EXTENT_SIZE_HINT "extent_size_hint" +#define BLOCK_OPT_OBJECT_SIZE "object_size" +#define BLOCK_OPT_REFCOUNT_BITS "refcount_bits" +#define BLOCK_OPT_DATA_FILE "data_file" +#define BLOCK_OPT_DATA_FILE_RAW "data_file_raw" +#define BLOCK_OPT_COMPRESSION_TYPE "compression_type" +#define BLOCK_OPT_EXTL2 "extended_l2" + +#define BLOCK_PROBE_BUF_SIZE 512 + +enum BdrvTrackedRequestType { + BDRV_TRACKED_READ, + BDRV_TRACKED_WRITE, + BDRV_TRACKED_DISCARD, + BDRV_TRACKED_TRUNCATE, +}; + +/* + * That is not quite good that BdrvTrackedRequest structure is public, + * as block/io.c is very careful about incoming offset/bytes being + * correct. Be sure to assert bdrv_check_request() succeeded after any + * modification of BdrvTrackedRequest object out of block/io.c + */ +typedef struct BdrvTrackedRequest { + BlockDriverState *bs; + int64_t offset; + int64_t bytes; + enum BdrvTrackedRequestType type; + + bool serialising; + int64_t overlap_offset; + int64_t overlap_bytes; + + QLIST_ENTRY(BdrvTrackedRequest) list; + Coroutine *co; /* owner, used for deadlock detection */ + CoQueue wait_queue; /* coroutines blocked on this request */ + + struct BdrvTrackedRequest *waiting_for; +} BdrvTrackedRequest; + + +struct BlockDriver { + /* + * These fields are initialized when this object is created, + * and are never changed afterwards. + */ + + const char *format_name; + int instance_size; + + /* + * Set to true if the BlockDriver is a block filter. Block filters pass + * certain callbacks that refer to data (see block.c) to their bs->file + * or bs->backing (whichever one exists) if the driver doesn't implement + * them. Drivers that do not wish to forward must implement them and return + * -ENOTSUP. + * Note that filters are not allowed to modify data. + * + * Filters generally cannot have more than a single filtered child, + * because the data they present must at all times be the same as + * that on their filtered child. That would be impossible to + * achieve for multiple filtered children. + * (And this filtered child must then be bs->file or bs->backing.) + */ + bool is_filter; + /* + * Set to true if the BlockDriver is a format driver. Format nodes + * generally do not expect their children to be other format nodes + * (except for backing files), and so format probing is disabled + * on those children. + */ + bool is_format; + + /* + * Drivers not implementing bdrv_parse_filename nor bdrv_open should have + * this field set to true, except ones that are defined only by their + * child's bs. + * An example of the last type will be the quorum block driver. + */ + bool bdrv_needs_filename; + + /* + * Set if a driver can support backing files. This also implies the + * following semantics: + * + * - Return status 0 of .bdrv_co_block_status means that corresponding + * blocks are not allocated in this layer of backing-chain + * - For such (unallocated) blocks, read will: + * - fill buffer with zeros if there is no backing file + * - read from the backing file otherwise, where the block layer + * takes care of reading zeros beyond EOF if backing file is short + */ + bool supports_backing; + + bool has_variable_length; + + /* + * Drivers setting this field must be able to work with just a plain + * filename with '<protocol_name>:' as a prefix, and no other options. + * Options may be extracted from the filename by implementing + * bdrv_parse_filename. + */ + const char *protocol_name; + + /* List of options for creating images, terminated by name == NULL */ + QemuOptsList *create_opts; + + /* List of options for image amend */ + QemuOptsList *amend_opts; + + /* + * If this driver supports reopening images this contains a + * NULL-terminated list of the runtime options that can be + * modified. If an option in this list is unspecified during + * reopen then it _must_ be reset to its default value or return + * an error. + */ + const char *const *mutable_opts; + + /* + * Pointer to a NULL-terminated array of names of strong options + * that can be specified for bdrv_open(). A strong option is one + * that changes the data of a BDS. + * If this pointer is NULL, the array is considered empty. + * "filename" and "driver" are always considered strong. + */ + const char *const *strong_runtime_opts; + + + /* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + + /* + * This function is invoked under BQL before .bdrv_co_amend() + * (which in contrast does not necessarily run under the BQL) + * to allow driver-specific initialization code that requires + * the BQL, like setting up specific permission flags. + */ + int (*bdrv_amend_pre_run)(BlockDriverState *bs, Error **errp); + /* + * This function is invoked under BQL after .bdrv_co_amend() + * to allow cleaning up what was done in .bdrv_amend_pre_run(). + */ + void (*bdrv_amend_clean)(BlockDriverState *bs); + + /* + * Return true if @to_replace can be replaced by a BDS with the + * same data as @bs without it affecting @bs's behavior (that is, + * without it being visible to @bs's parents). + */ + bool (*bdrv_recurse_can_replace)(BlockDriverState *bs, + BlockDriverState *to_replace); + + int (*bdrv_probe_device)(const char *filename); + + /* + * Any driver implementing this callback is expected to be able to handle + * NULL file names in its .bdrv_open() implementation. + */ + void (*bdrv_parse_filename)(const char *filename, QDict *options, + Error **errp); + + /* For handling image reopen for split or non-split files. */ + int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp); + void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state); + void (*bdrv_reopen_commit_post)(BDRVReopenState *reopen_state); + void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state); + void (*bdrv_join_options)(QDict *options, QDict *old_options); + + int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags, + Error **errp); + + /* Protocol drivers should implement this instead of bdrv_open */ + int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags, + Error **errp); + void (*bdrv_close)(BlockDriverState *bs); + + int coroutine_fn (*bdrv_co_create)(BlockdevCreateOptions *opts, + Error **errp); + int coroutine_fn (*bdrv_co_create_opts)(BlockDriver *drv, + const char *filename, + QemuOpts *opts, + Error **errp); + + int (*bdrv_amend_options)(BlockDriverState *bs, + QemuOpts *opts, + BlockDriverAmendStatusCB *status_cb, + void *cb_opaque, + bool force, + Error **errp); + + int (*bdrv_make_empty)(BlockDriverState *bs); + + /* + * Refreshes the bs->exact_filename field. If that is impossible, + * bs->exact_filename has to be left empty. + */ + void (*bdrv_refresh_filename)(BlockDriverState *bs); + + /* + * Gathers the open options for all children into @target. + * A simple format driver (without backing file support) might + * implement this function like this: + * + * QINCREF(bs->file->bs->full_open_options); + * qdict_put(target, "file", bs->file->bs->full_open_options); + * + * If not specified, the generic implementation will simply put + * all children's options under their respective name. + * + * @backing_overridden is true when bs->backing seems not to be + * the child that would result from opening bs->backing_file. + * Therefore, if it is true, the backing child's options should be + * gathered; otherwise, there is no need since the backing child + * is the one implied by the image header. + * + * Note that ideally this function would not be needed. Every + * block driver which implements it is probably doing something + * shady regarding its runtime option structure. + */ + void (*bdrv_gather_child_options)(BlockDriverState *bs, QDict *target, + bool backing_overridden); + + /* + * Returns an allocated string which is the directory name of this BDS: It + * will be used to make relative filenames absolute by prepending this + * function's return value to them. + */ + char *(*bdrv_dirname)(BlockDriverState *bs, Error **errp); + + /* + * This informs the driver that we are no longer interested in the result + * of in-flight requests, so don't waste the time if possible. + * + * One example usage is to avoid waiting for an nbd target node reconnect + * timeout during job-cancel with force=true. + */ + void (*bdrv_cancel_in_flight)(BlockDriverState *bs); + + int (*bdrv_inactivate)(BlockDriverState *bs); + + int (*bdrv_snapshot_create)(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info); + int (*bdrv_snapshot_goto)(BlockDriverState *bs, + const char *snapshot_id); + int (*bdrv_snapshot_delete)(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); + int (*bdrv_snapshot_list)(BlockDriverState *bs, + QEMUSnapshotInfo **psn_info); + int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); + + int (*bdrv_change_backing_file)(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt); + + /* TODO Better pass a option string/QDict/QemuOpts to add any rule? */ + int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event, + const char *tag); + int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs, + const char *tag); + int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag); + bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag); + + void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp); + + /* + * Returns 1 if newly created images are guaranteed to contain only + * zeros, 0 otherwise. + */ + int (*bdrv_has_zero_init)(BlockDriverState *bs); + + /* + * Remove fd handlers, timers, and other event loop callbacks so the event + * loop is no longer in use. Called with no in-flight requests and in + * depth-first traversal order with parents before child nodes. + */ + void (*bdrv_detach_aio_context)(BlockDriverState *bs); + + /* + * Add fd handlers, timers, and other event loop callbacks so I/O requests + * can be processed again. Called with no in-flight requests and in + * depth-first traversal order with child nodes before parent nodes. + */ + void (*bdrv_attach_aio_context)(BlockDriverState *bs, + AioContext *new_context); + + /** + * Try to get @bs's logical and physical block size. + * On success, store them in @bsz and return zero. + * On failure, return negative errno. + */ + int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz); + /** + * Try to get @bs's geometry (cyls, heads, sectors) + * On success, store them in @geo and return 0. + * On failure return -errno. + * Only drivers that want to override guest geometry implement this + * callback; see hd_geometry_guess(). + */ + int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo); + + void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child, + Error **errp); + void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child, + Error **errp); + + /** + * Informs the block driver that a permission change is intended. The + * driver checks whether the change is permissible and may take other + * preparations for the change (e.g. get file system locks). This operation + * is always followed either by a call to either .bdrv_set_perm or + * .bdrv_abort_perm_update. + * + * Checks whether the requested set of cumulative permissions in @perm + * can be granted for accessing @bs and whether no other users are using + * permissions other than those given in @shared (both arguments take + * BLK_PERM_* bitmasks). + * + * If both conditions are met, 0 is returned. Otherwise, -errno is returned + * and errp is set to an error describing the conflict. + */ + int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm, + uint64_t shared, Error **errp); + + /** + * Called to inform the driver that the set of cumulative set of used + * permissions for @bs has changed to @perm, and the set of sharable + * permission to @shared. The driver can use this to propagate changes to + * its children (i.e. request permissions only if a parent actually needs + * them). + * + * This function is only invoked after bdrv_check_perm(), so block drivers + * may rely on preparations made in their .bdrv_check_perm implementation. + */ + void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t shared); + + /* + * Called to inform the driver that after a previous bdrv_check_perm() + * call, the permission update is not performed and any preparations made + * for it (e.g. taken file locks) need to be undone. + * + * This function can be called even for nodes that never saw a + * bdrv_check_perm() call. It is a no-op then. + */ + void (*bdrv_abort_perm_update)(BlockDriverState *bs); + + /** + * Returns in @nperm and @nshared the permissions that the driver for @bs + * needs on its child @c, based on the cumulative permissions requested by + * the parents in @parent_perm and @parent_shared. + * + * If @c is NULL, return the permissions for attaching a new child for the + * given @child_class and @role. + * + * If @reopen_queue is non-NULL, don't return the currently needed + * permissions, but those that will be needed after applying the + * @reopen_queue. + */ + void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c, + BdrvChildRole role, + BlockReopenQueue *reopen_queue, + uint64_t parent_perm, uint64_t parent_shared, + uint64_t *nperm, uint64_t *nshared); + + /** + * Register/unregister a buffer for I/O. For example, when the driver is + * interested to know the memory areas that will later be used in iovs, so + * that it can do IOMMU mapping with VFIO etc., in order to get better + * performance. In the case of VFIO drivers, this callback is used to do + * DMA mapping for hot buffers. + */ + void (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size); + void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host); + + /* + * This field is modified only under the BQL, and is part of + * the global state. + */ + QLIST_ENTRY(BlockDriver) list; + + /* + * I/O API functions. These functions are thread-safe. + * + * See include/block/block-io.h for more information about + * the I/O API. + */ + + int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename); + + int coroutine_fn (*bdrv_co_amend)(BlockDriverState *bs, + BlockdevAmendOptions *opts, + bool force, + Error **errp); + + /* aio */ + BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque); + BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque); + BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque); + BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs, + int64_t offset, int bytes, + BlockCompletionFunc *cb, void *opaque); + + int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); + + /** + * @offset: position in bytes to read at + * @bytes: number of bytes to read + * @qiov: the buffers to fill with read data + * @flags: currently unused, always 0 + * + * @offset and @bytes will be a multiple of 'request_alignment', + * but the length of individual @qiov elements does not have to + * be a multiple. + * + * @bytes will always equal the total size of @qiov, and will be + * no larger than 'max_transfer'. + * + * The buffer in @qiov may point directly to guest memory. + */ + int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); + + int coroutine_fn (*bdrv_co_preadv_part)(BlockDriverState *bs, + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, + BdrvRequestFlags flags); + + int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + int flags); + /** + * @offset: position in bytes to write at + * @bytes: number of bytes to write + * @qiov: the buffers containing data to write + * @flags: zero or more bits allowed by 'supported_write_flags' + * + * @offset and @bytes will be a multiple of 'request_alignment', + * but the length of individual @qiov elements does not have to + * be a multiple. + * + * @bytes will always equal the total size of @qiov, and will be + * no larger than 'max_transfer'. + * + * The buffer in @qiov may point directly to guest memory. + */ + int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); + int coroutine_fn (*bdrv_co_pwritev_part)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, + BdrvRequestFlags flags); + + /* + * Efficiently zero a region of the disk image. Typically an image format + * would use a compact metadata representation to implement this. This + * function pointer may be NULL or return -ENOSUP and .bdrv_co_writev() + * will be called instead. + */ + int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, + int64_t offset, int64_t bytes, BdrvRequestFlags flags); + int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, + int64_t offset, int64_t bytes); + + /* + * Map [offset, offset + nbytes) range onto a child of @bs to copy from, + * and invoke bdrv_co_copy_range_from(child, ...), or invoke + * bdrv_co_copy_range_to() if @bs is the leaf child to copy data from. + * + * See the comment of bdrv_co_copy_range for the parameter and return value + * semantics. + */ + int coroutine_fn (*bdrv_co_copy_range_from)(BlockDriverState *bs, + BdrvChild *src, + int64_t offset, + BdrvChild *dst, + int64_t dst_offset, + int64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + + /* + * Map [offset, offset + nbytes) range onto a child of bs to copy data to, + * and invoke bdrv_co_copy_range_to(child, src, ...), or perform the copy + * operation if @bs is the leaf and @src has the same BlockDriver. Return + * -ENOTSUP if @bs is the leaf but @src has a different BlockDriver. + * + * See the comment of bdrv_co_copy_range for the parameter and return value + * semantics. + */ + int coroutine_fn (*bdrv_co_copy_range_to)(BlockDriverState *bs, + BdrvChild *src, + int64_t src_offset, + BdrvChild *dst, + int64_t dst_offset, + int64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + + /* + * Building block for bdrv_block_status[_above] and + * bdrv_is_allocated[_above]. The driver should answer only + * according to the current layer, and should only need to set + * BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_OFFSET_VALID, + * and/or BDRV_BLOCK_RAW; if the current layer defers to a backing + * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). See + * block.h for the overall meaning of the bits. As a hint, the + * flag want_zero is true if the caller cares more about precise + * mappings (favor accurate _OFFSET_VALID/_ZERO) or false for + * overall allocation (favor larger *pnum, perhaps by reporting + * _DATA instead of _ZERO). The block layer guarantees input + * clamped to bdrv_getlength() and aligned to request_alignment, + * as well as non-NULL pnum, map, and file; in turn, the driver + * must return an error or set pnum to an aligned non-zero value. + * + * Note that @bytes is just a hint on how big of a region the + * caller wants to inspect. It is not a limit on *pnum. + * Implementations are free to return larger values of *pnum if + * doing so does not incur a performance penalty. + * + * block/io.c's bdrv_co_block_status() will utilize an unclamped + * *pnum value for the block-status cache on protocol nodes, prior + * to clamping *pnum for return to its caller. + */ + int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs, + bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file); + + /* + * Snapshot-access API. + * + * Block-driver may provide snapshot-access API: special functions to access + * some internal "snapshot". The functions are similar with normal + * read/block_status/discard handler, but don't have any specific handling + * in generic block-layer: no serializing, no alignment, no tracked + * requests. So, block-driver that realizes these APIs is fully responsible + * for synchronization between snapshot-access API and normal IO requests. + * + * TODO: To be able to support qcow2's internal snapshots, this API will + * need to be extended to: + * - be able to select a specific snapshot + * - receive the snapshot's actual length (which may differ from bs's + * length) + */ + int coroutine_fn (*bdrv_co_preadv_snapshot)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset); + int coroutine_fn (*bdrv_co_snapshot_block_status)(BlockDriverState *bs, + bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file); + int coroutine_fn (*bdrv_co_pdiscard_snapshot)(BlockDriverState *bs, + int64_t offset, int64_t bytes); + + /* + * Invalidate any cached meta-data. + */ + void coroutine_fn (*bdrv_co_invalidate_cache)(BlockDriverState *bs, + Error **errp); + + /* + * Flushes all data for all layers by calling bdrv_co_flush for underlying + * layers, if needed. This function is needed for deterministic + * synchronization of the flush finishing callback. + */ + int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs); + + /* Delete a created file. */ + int coroutine_fn (*bdrv_co_delete_file)(BlockDriverState *bs, + Error **errp); + + /* + * Flushes all data that was already written to the OS all the way down to + * the disk (for example file-posix.c calls fsync()). + */ + int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs); + + /* + * Flushes all internal caches to the OS. The data may still sit in a + * writeback cache of the host OS, but it will survive a crash of the qemu + * process. + */ + int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs); + + /* + * Truncate @bs to @offset bytes using the given @prealloc mode + * when growing. Modes other than PREALLOC_MODE_OFF should be + * rejected when shrinking @bs. + * + * If @exact is true, @bs must be resized to exactly @offset. + * Otherwise, it is sufficient for @bs (if it is a host block + * device and thus there is no way to resize it) to be at least + * @offset bytes in length. + * + * If @exact is true and this function fails but would succeed + * with @exact = false, it should return -ENOTSUP. + */ + int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp); + int64_t (*bdrv_getlength)(BlockDriverState *bs); + int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs); + BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs, + Error **errp); + + int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov); + int coroutine_fn (*bdrv_co_pwritev_compressed_part)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + size_t qiov_offset); + + int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi); + + ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs, + Error **errp); + BlockStatsSpecific *(*bdrv_get_specific_stats)(BlockDriverState *bs); + + int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs, + QEMUIOVector *qiov, + int64_t pos); + int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs, + QEMUIOVector *qiov, + int64_t pos); + + /* removable device specific */ + bool (*bdrv_is_inserted)(BlockDriverState *bs); + void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); + void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked); + + /* to control generic scsi devices */ + BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque); + int coroutine_fn (*bdrv_co_ioctl)(BlockDriverState *bs, + unsigned long int req, void *buf); + + /* + * Returns 0 for completed check, -errno for internal errors. + * The check results are stored in result. + */ + int coroutine_fn (*bdrv_co_check)(BlockDriverState *bs, + BdrvCheckResult *result, + BdrvCheckMode fix); + + void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event); + + /* io queue for linux-aio */ + void (*bdrv_io_plug)(BlockDriverState *bs); + void (*bdrv_io_unplug)(BlockDriverState *bs); + + /** + * bdrv_co_drain_begin is called if implemented in the beginning of a + * drain operation to drain and stop any internal sources of requests in + * the driver. + * bdrv_co_drain_end is called if implemented at the end of the drain. + * + * They should be used by the driver to e.g. manage scheduled I/O + * requests, or toggle an internal state. After the end of the drain new + * requests will continue normally. + */ + void coroutine_fn (*bdrv_co_drain_begin)(BlockDriverState *bs); + void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs); + + bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs); + bool (*bdrv_co_can_store_new_dirty_bitmap)(BlockDriverState *bs, + const char *name, + uint32_t granularity, + Error **errp); + int (*bdrv_co_remove_persistent_dirty_bitmap)(BlockDriverState *bs, + const char *name, + Error **errp); +}; + +static inline bool block_driver_can_compress(BlockDriver *drv) +{ + return drv->bdrv_co_pwritev_compressed || + drv->bdrv_co_pwritev_compressed_part; +} + +typedef struct BlockLimits { + /* + * Alignment requirement, in bytes, for offset/length of I/O + * requests. Must be a power of 2 less than INT_MAX; defaults to + * 1 for drivers with modern byte interfaces, and to 512 + * otherwise. + */ + uint32_t request_alignment; + + /* + * Maximum number of bytes that can be discarded at once. Must be multiple + * of pdiscard_alignment, but need not be power of 2. May be 0 if no + * inherent 64-bit limit. + */ + int64_t max_pdiscard; + + /* + * Optimal alignment for discard requests in bytes. A power of 2 + * is best but not mandatory. Must be a multiple of + * bl.request_alignment, and must be less than max_pdiscard if + * that is set. May be 0 if bl.request_alignment is good enough + */ + uint32_t pdiscard_alignment; + + /* + * Maximum number of bytes that can zeroized at once. Must be multiple of + * pwrite_zeroes_alignment. 0 means no limit. + */ + int64_t max_pwrite_zeroes; + + /* + * Optimal alignment for write zeroes requests in bytes. A power + * of 2 is best but not mandatory. Must be a multiple of + * bl.request_alignment, and must be less than max_pwrite_zeroes + * if that is set. May be 0 if bl.request_alignment is good + * enough + */ + uint32_t pwrite_zeroes_alignment; + + /* + * Optimal transfer length in bytes. A power of 2 is best but not + * mandatory. Must be a multiple of bl.request_alignment, or 0 if + * no preferred size + */ + uint32_t opt_transfer; + + /* + * Maximal transfer length in bytes. Need not be power of 2, but + * must be multiple of opt_transfer and bl.request_alignment, or 0 + * for no 32-bit limit. For now, anything larger than INT_MAX is + * clamped down. + */ + uint32_t max_transfer; + + /* + * Maximal hardware transfer length in bytes. Applies whenever + * transfers to the device bypass the kernel I/O scheduler, for + * example with SG_IO. If larger than max_transfer or if zero, + * blk_get_max_hw_transfer will fall back to max_transfer. + */ + uint64_t max_hw_transfer; + + /* + * Maximal number of scatter/gather elements allowed by the hardware. + * Applies whenever transfers to the device bypass the kernel I/O + * scheduler, for example with SG_IO. If larger than max_iov + * or if zero, blk_get_max_hw_iov will fall back to max_iov. + */ + int max_hw_iov; + + + /* memory alignment, in bytes so that no bounce buffer is needed */ + size_t min_mem_alignment; + + /* memory alignment, in bytes, for bounce buffer */ + size_t opt_mem_alignment; + + /* maximum number of iovec elements */ + int max_iov; +} BlockLimits; + +typedef struct BdrvOpBlocker BdrvOpBlocker; + +typedef struct BdrvAioNotifier { + void (*attached_aio_context)(AioContext *new_context, void *opaque); + void (*detach_aio_context)(void *opaque); + + void *opaque; + bool deleted; + + QLIST_ENTRY(BdrvAioNotifier) list; +} BdrvAioNotifier; + +struct BdrvChildClass { + /* + * If true, bdrv_replace_node() doesn't change the node this BdrvChild + * points to. + */ + bool stay_at_node; + + /* + * If true, the parent is a BlockDriverState and bdrv_next_all_states() + * will return it. This information is used for drain_all, where every node + * will be drained separately, so the drain only needs to be propagated to + * non-BDS parents. + */ + bool parent_is_bds; + + /* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + void (*inherit_options)(BdrvChildRole role, bool parent_is_format, + int *child_flags, QDict *child_options, + int parent_flags, QDict *parent_options); + void (*change_media)(BdrvChild *child, bool load); + + /* + * Returns a malloced string that describes the parent of the child for a + * human reader. This could be a node-name, BlockBackend name, qdev ID or + * QOM path of the device owning the BlockBackend, job type and ID etc. The + * caller is responsible for freeing the memory. + */ + char *(*get_parent_desc)(BdrvChild *child); + + /* + * Notifies the parent that the child has been activated/inactivated (e.g. + * when migration is completing) and it can start/stop requesting + * permissions and doing I/O on it. + */ + void (*activate)(BdrvChild *child, Error **errp); + int (*inactivate)(BdrvChild *child); + + void (*attach)(BdrvChild *child); + void (*detach)(BdrvChild *child); + + /* + * Notifies the parent that the filename of its child has changed (e.g. + * because the direct child was removed from the backing chain), so that it + * can update its reference. + */ + int (*update_filename)(BdrvChild *child, BlockDriverState *new_base, + const char *filename, Error **errp); + + bool (*can_set_aio_ctx)(BdrvChild *child, AioContext *ctx, + GSList **ignore, Error **errp); + void (*set_aio_ctx)(BdrvChild *child, AioContext *ctx, GSList **ignore); + + AioContext *(*get_parent_aio_context)(BdrvChild *child); + + /* + * I/O API functions. These functions are thread-safe. + * + * See include/block/block-io.h for more information about + * the I/O API. + */ + + void (*resize)(BdrvChild *child); + + /* + * Returns a name that is supposedly more useful for human users than the + * node name for identifying the node in question (in particular, a BB + * name), or NULL if the parent can't provide a better name. + */ + const char *(*get_name)(BdrvChild *child); + + /* + * If this pair of functions is implemented, the parent doesn't issue new + * requests after returning from .drained_begin() until .drained_end() is + * called. + * + * These functions must not change the graph (and therefore also must not + * call aio_poll(), which could change the graph indirectly). + * + * If drained_end() schedules background operations, it must atomically + * increment *drained_end_counter for each such operation and atomically + * decrement it once the operation has settled. + * + * Note that this can be nested. If drained_begin() was called twice, new + * I/O is allowed only after drained_end() was called twice, too. + */ + void (*drained_begin)(BdrvChild *child); + void (*drained_end)(BdrvChild *child, int *drained_end_counter); + + /* + * Returns whether the parent has pending requests for the child. This + * callback is polled after .drained_begin() has been called until all + * activity on the child has stopped. + */ + bool (*drained_poll)(BdrvChild *child); +}; + +extern const BdrvChildClass child_of_bds; + +struct BdrvChild { + BlockDriverState *bs; + char *name; + const BdrvChildClass *klass; + BdrvChildRole role; + void *opaque; + + /** + * Granted permissions for operating on this BdrvChild (BLK_PERM_* bitmask) + */ + uint64_t perm; + + /** + * Permissions that can still be granted to other users of @bs while this + * BdrvChild is still attached to it. (BLK_PERM_* bitmask) + */ + uint64_t shared_perm; + + /* + * This link is frozen: the child can neither be replaced nor + * detached from the parent. + */ + bool frozen; + + /* + * How many times the parent of this child has been drained + * (through klass->drained_*). + * Usually, this is equal to bs->quiesce_counter (potentially + * reduced by bdrv_drain_all_count). It may differ while the + * child is entering or leaving a drained section. + */ + int parent_quiesce_counter; + + QLIST_ENTRY(BdrvChild) next; + QLIST_ENTRY(BdrvChild) next_parent; +}; + +/* + * Allows bdrv_co_block_status() to cache one data region for a + * protocol node. + * + * @valid: Whether the cache is valid (should be accessed with atomic + * functions so this can be reset by RCU readers) + * @data_start: Offset where we know (or strongly assume) is data + * @data_end: Offset where the data region ends (which is not necessarily + * the start of a zeroed region) + */ +typedef struct BdrvBlockStatusCache { + struct rcu_head rcu; + + bool valid; + int64_t data_start; + int64_t data_end; +} BdrvBlockStatusCache; + +struct BlockDriverState { + /* + * Protected by big QEMU lock or read-only after opening. No special + * locking needed during I/O... + */ + int open_flags; /* flags used to open the file, re-used for re-open */ + bool encrypted; /* if true, the media is encrypted */ + bool sg; /* if true, the device is a /dev/sg* */ + bool probed; /* if true, format was probed rather than specified */ + bool force_share; /* if true, always allow all shared permissions */ + bool implicit; /* if true, this filter node was automatically inserted */ + + BlockDriver *drv; /* NULL means no media */ + void *opaque; + + AioContext *aio_context; /* event loop used for fd handlers, timers, etc */ + /* + * long-running tasks intended to always use the same AioContext as this + * BDS may register themselves in this list to be notified of changes + * regarding this BDS's context + */ + QLIST_HEAD(, BdrvAioNotifier) aio_notifiers; + bool walking_aio_notifiers; /* to make removal during iteration safe */ + + char filename[PATH_MAX]; + /* + * If not empty, this image is a diff in relation to backing_file. + * Note that this is the name given in the image header and + * therefore may or may not be equal to .backing->bs->filename. + * If this field contains a relative path, it is to be resolved + * relatively to the overlay's location. + */ + char backing_file[PATH_MAX]; + /* + * The backing filename indicated by the image header. Contrary + * to backing_file, if we ever open this file, auto_backing_file + * is replaced by the resulting BDS's filename (i.e. after a + * bdrv_refresh_filename() run). + */ + char auto_backing_file[PATH_MAX]; + char backing_format[16]; /* if non-zero and backing_file exists */ + + QDict *full_open_options; + char exact_filename[PATH_MAX]; + + BdrvChild *backing; + BdrvChild *file; + + /* I/O Limits */ + BlockLimits bl; + + /* + * Flags honored during pread + */ + unsigned int supported_read_flags; + /* + * Flags honored during pwrite (so far: BDRV_REQ_FUA, + * BDRV_REQ_WRITE_UNCHANGED). + * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those + * writes will be issued as normal writes without the flag set. + * This is important to note for drivers that do not explicitly + * request a WRITE permission for their children and instead take + * the same permissions as their parent did (this is commonly what + * block filters do). Such drivers have to be aware that the + * parent may have taken a WRITE_UNCHANGED permission only and is + * issuing such requests. Drivers either must make sure that + * these requests do not result in plain WRITE accesses (usually + * by supporting BDRV_REQ_WRITE_UNCHANGED, and then forwarding + * every incoming write request as-is, including potentially that + * flag), or they have to explicitly take the WRITE permission for + * their children. + */ + unsigned int supported_write_flags; + /* + * Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA, + * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) + */ + unsigned int supported_zero_flags; + /* + * Flags honoured during truncate (so far: BDRV_REQ_ZERO_WRITE). + * + * If BDRV_REQ_ZERO_WRITE is given, the truncate operation must make sure + * that any added space reads as all zeros. If this can't be guaranteed, + * the operation must fail. + */ + unsigned int supported_truncate_flags; + + /* the following member gives a name to every node on the bs graph. */ + char node_name[32]; + /* element of the list of named nodes building the graph */ + QTAILQ_ENTRY(BlockDriverState) node_list; + /* element of the list of all BlockDriverStates (all_bdrv_states) */ + QTAILQ_ENTRY(BlockDriverState) bs_list; + /* element of the list of monitor-owned BDS */ + QTAILQ_ENTRY(BlockDriverState) monitor_list; + int refcnt; + + /* operation blockers. Protected by BQL. */ + QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX]; + + /* + * The node that this node inherited default options from (and a reopen on + * which can affect this node by changing these defaults). This is always a + * parent node of this node. + */ + BlockDriverState *inherits_from; + QLIST_HEAD(, BdrvChild) children; + QLIST_HEAD(, BdrvChild) parents; + + QDict *options; + QDict *explicit_options; + BlockdevDetectZeroesOptions detect_zeroes; + + /* The error object in use for blocking operations on backing_hd */ + Error *backing_blocker; + + /* Protected by AioContext lock */ + + /* + * If we are reading a disk image, give its size in sectors. + * Generally read-only; it is written to by load_snapshot and + * save_snaphost, but the block layer is quiescent during those. + */ + int64_t total_sectors; + + /* threshold limit for writes, in bytes. "High water mark". */ + uint64_t write_threshold_offset; + + /* + * Writing to the list requires the BQL _and_ the dirty_bitmap_mutex. + * Reading from the list can be done with either the BQL or the + * dirty_bitmap_mutex. Modifying a bitmap only requires + * dirty_bitmap_mutex. + */ + QemuMutex dirty_bitmap_mutex; + QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; + + /* Offset after the highest byte written to */ + Stat64 wr_highest_offset; + + /* + * If true, copy read backing sectors into image. Can be >1 if more + * than one client has requested copy-on-read. Accessed with atomic + * ops. + */ + int copy_on_read; + + /* + * number of in-flight requests; overall and serialising. + * Accessed with atomic ops. + */ + unsigned int in_flight; + unsigned int serialising_in_flight; + + /* + * counter for nested bdrv_io_plug. + * Accessed with atomic ops. + */ + unsigned io_plugged; + + /* do we need to tell the quest if we have a volatile write cache? */ + int enable_write_cache; + + /* Accessed with atomic ops. */ + int quiesce_counter; + int recursive_quiesce_counter; + + unsigned int write_gen; /* Current data generation */ + + /* Protected by reqs_lock. */ + CoMutex reqs_lock; + QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; + CoQueue flush_queue; /* Serializing flush queue */ + bool active_flush_req; /* Flush request in flight? */ + + /* Only read/written by whoever has set active_flush_req to true. */ + unsigned int flushed_gen; /* Flushed write generation */ + + /* BdrvChild links to this node may never be frozen */ + bool never_freeze; + + /* Lock for block-status cache RCU writers */ + CoMutex bsc_modify_lock; + /* Always non-NULL, but must only be dereferenced under an RCU read guard */ + BdrvBlockStatusCache *block_status_cache; +}; + +struct BlockBackendRootState { + int open_flags; + BlockdevDetectZeroesOptions detect_zeroes; +}; + +typedef enum BlockMirrorBackingMode { + /* + * Reuse the existing backing chain from the source for the target. + * - sync=full: Set backing BDS to NULL. + * - sync=top: Use source's backing BDS. + * - sync=none: Use source as the backing BDS. + */ + MIRROR_SOURCE_BACKING_CHAIN, + + /* Open the target's backing chain completely anew */ + MIRROR_OPEN_BACKING_CHAIN, + + /* Do not change the target's backing BDS after job completion */ + MIRROR_LEAVE_BACKING_CHAIN, +} BlockMirrorBackingMode; + + +/* + * Essential block drivers which must always be statically linked into qemu, and + * which therefore can be accessed without using bdrv_find_format() + */ +extern BlockDriver bdrv_file; +extern BlockDriver bdrv_raw; +extern BlockDriver bdrv_qcow2; + +extern unsigned int bdrv_drain_all_count; +extern QemuOptsList bdrv_create_opts_simple; + +/* + * Common functions that are neither I/O nor Global State. + * + * See include/block/block-commmon.h for more information about + * the Common API. + */ + +static inline BlockDriverState *child_bs(BdrvChild *child) +{ + return child ? child->bs : NULL; +} + +int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp); +int get_tmp_filename(char *filename, int size); +void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix, + QDict *options); + + +int bdrv_check_qiov_request(int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, + Error **errp); + +#ifdef _WIN32 +int is_windows_drive(const char *filename); +#endif + +#endif /* BLOCK_INT_COMMON_H */ diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h new file mode 100644 index 0000000..0f21b05 --- /dev/null +++ b/include/block/block_int-global-state.h @@ -0,0 +1,329 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef BLOCK_INT_GLOBAL_STATE_H +#define BLOCK_INT_GLOBAL_STATE_H + +#include "block_int-common.h" + +/* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + +/** + * stream_start: + * @job_id: The id of the newly-created job, or %NULL to use the + * device name of @bs. + * @bs: Block device to operate on. + * @base: Block device that will become the new base, or %NULL to + * flatten the whole backing file chain onto @bs. + * @backing_file_str: The file name that will be written to @bs as the + * the new backing file if the job completes. Ignored if @base is %NULL. + * @creation_flags: Flags that control the behavior of the Job lifetime. + * See @BlockJobCreateFlags + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @on_error: The action to take upon error. + * @filter_node_name: The node name that should be assigned to the filter + * driver that the stream job inserts into the graph above + * @bs. NULL means that a node name should be autogenerated. + * @errp: Error object. + * + * Start a streaming operation on @bs. Clusters that are unallocated + * in @bs, but allocated in any image between @base and @bs (both + * exclusive) will be written to @bs. At the end of a successful + * streaming job, the backing file of @bs will be changed to + * @backing_file_str in the written image and to @base in the live + * BlockDriverState. + */ +void stream_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *base, const char *backing_file_str, + BlockDriverState *bottom, + int creation_flags, int64_t speed, + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp); + +/** + * commit_start: + * @job_id: The id of the newly-created job, or %NULL to use the + * device name of @bs. + * @bs: Active block device. + * @top: Top block device to be committed. + * @base: Block device that will be written into, and become the new top. + * @creation_flags: Flags that control the behavior of the Job lifetime. + * See @BlockJobCreateFlags + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @on_error: The action to take upon error. + * @backing_file_str: String to use as the backing file in @top's overlay + * @filter_node_name: The node name that should be assigned to the filter + * driver that the commit job inserts into the graph above @top. NULL means + * that a node name should be autogenerated. + * @errp: Error object. + * + */ +void commit_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *base, BlockDriverState *top, + int creation_flags, int64_t speed, + BlockdevOnError on_error, const char *backing_file_str, + const char *filter_node_name, Error **errp); +/** + * commit_active_start: + * @job_id: The id of the newly-created job, or %NULL to use the + * device name of @bs. + * @bs: Active block device to be committed. + * @base: Block device that will be written into, and become the new top. + * @creation_flags: Flags that control the behavior of the Job lifetime. + * See @BlockJobCreateFlags + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @on_error: The action to take upon error. + * @filter_node_name: The node name that should be assigned to the filter + * driver that the commit job inserts into the graph above @bs. NULL means that + * a node name should be autogenerated. + * @cb: Completion function for the job. + * @opaque: Opaque pointer value passed to @cb. + * @auto_complete: Auto complete the job. + * @errp: Error object. + * + */ +BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *base, int creation_flags, + int64_t speed, BlockdevOnError on_error, + const char *filter_node_name, + BlockCompletionFunc *cb, void *opaque, + bool auto_complete, Error **errp); +/* + * mirror_start: + * @job_id: The id of the newly-created job, or %NULL to use the + * device name of @bs. + * @bs: Block device to operate on. + * @target: Block device to write to. + * @replaces: Block graph node name to replace once the mirror is done. Can + * only be used when full mirroring is selected. + * @creation_flags: Flags that control the behavior of the Job lifetime. + * See @BlockJobCreateFlags + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @granularity: The chosen granularity for the dirty bitmap. + * @buf_size: The amount of data that can be in flight at one time. + * @mode: Whether to collapse all images in the chain to the target. + * @backing_mode: How to establish the target's backing chain after completion. + * @zero_target: Whether the target should be explicitly zero-initialized + * @on_source_error: The action to take upon error reading from the source. + * @on_target_error: The action to take upon error writing to the target. + * @unmap: Whether to unmap target where source sectors only contain zeroes. + * @filter_node_name: The node name that should be assigned to the filter + * driver that the mirror job inserts into the graph above @bs. NULL means that + * a node name should be autogenerated. + * @copy_mode: When to trigger writes to the target. + * @errp: Error object. + * + * Start a mirroring operation on @bs. Clusters that are allocated + * in @bs will be written to @target until the job is cancelled or + * manually completed. At the end of a successful mirroring job, + * @bs will be switched to read from @target. + */ +void mirror_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *target, const char *replaces, + int creation_flags, int64_t speed, + uint32_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, + bool zero_target, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + bool unmap, const char *filter_node_name, + MirrorCopyMode copy_mode, Error **errp); + +/* + * backup_job_create: + * @job_id: The id of the newly-created job, or %NULL to use the + * device name of @bs. + * @bs: Block device to operate on. + * @target: Block device to write to. + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @sync_mode: What parts of the disk image should be copied to the destination. + * @sync_bitmap: The dirty bitmap if sync_mode is 'bitmap' or 'incremental' + * @bitmap_mode: The bitmap synchronization policy to use. + * @perf: Performance options. All actual fields assumed to be present, + * all ".has_*" fields are ignored. + * @on_source_error: The action to take upon error reading from the source. + * @on_target_error: The action to take upon error writing to the target. + * @creation_flags: Flags that control the behavior of the Job lifetime. + * See @BlockJobCreateFlags + * @cb: Completion function for the job. + * @opaque: Opaque pointer value passed to @cb. + * @txn: Transaction that this job is part of (may be NULL). + * + * Create a backup operation on @bs. Clusters in @bs are written to @target + * until the job is cancelled or manually completed. + */ +BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, + BlockDriverState *target, int64_t speed, + MirrorSyncMode sync_mode, + BdrvDirtyBitmap *sync_bitmap, + BitmapSyncMode bitmap_mode, + bool compress, + const char *filter_node_name, + BackupPerf *perf, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + int creation_flags, + BlockCompletionFunc *cb, void *opaque, + JobTxn *txn, Error **errp); + +BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, + const char *child_name, + const BdrvChildClass *child_class, + BdrvChildRole child_role, + uint64_t perm, uint64_t shared_perm, + void *opaque, Error **errp); +void bdrv_root_unref_child(BdrvChild *child); + +void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, + uint64_t *shared_perm); + +/** + * Sets a BdrvChild's permissions. Avoid if the parent is a BDS; use + * bdrv_child_refresh_perms() instead and make the parent's + * .bdrv_child_perm() implementation return the correct values. + */ +int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared, + Error **errp); + +/** + * Calls bs->drv->bdrv_child_perm() and updates the child's permission + * masks with the result. + * Drivers should invoke this function whenever an event occurs that + * makes their .bdrv_child_perm() implementation return different + * values than before, but which will not result in the block layer + * automatically refreshing the permissions. + */ +int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp); + +bool bdrv_recurse_can_replace(BlockDriverState *bs, + BlockDriverState *to_replace); + +/* + * Default implementation for BlockDriver.bdrv_child_perm() that can + * be used by block filters and image formats, as long as they use the + * child_of_bds child class and set an appropriate BdrvChildRole. + */ +void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c, + BdrvChildRole role, BlockReopenQueue *reopen_queue, + uint64_t perm, uint64_t shared, + uint64_t *nperm, uint64_t *nshared); + +void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp); +bool blk_dev_has_removable_media(BlockBackend *blk); +void blk_dev_eject_request(BlockBackend *blk, bool force); +bool blk_dev_is_medium_locked(BlockBackend *blk); + +void bdrv_restore_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *backup); + +void bdrv_set_monitor_owned(BlockDriverState *bs); + +void blockdev_close_all_bdrv_states(void); + +BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp); + +/** + * Simple implementation of bdrv_co_create_opts for protocol drivers + * which only support creation via opening a file + * (usually existing raw storage device) + */ +int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, + const char *filename, + QemuOpts *opts, + Error **errp); + +BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, + const char *name, + BlockDriverState **pbs, + Error **errp); +BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, + BlockDirtyBitmapMergeSourceList *bms, + HBitmap **backup, Error **errp); +BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, + bool release, + BlockDriverState **bitmap_bs, + Error **errp); + + +BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs); + +/** + * bdrv_add_aio_context_notifier: + * + * If a long-running job intends to be always run in the same AioContext as a + * certain BDS, it may use this function to be notified of changes regarding the + * association of the BDS to an AioContext. + * + * attached_aio_context() is called after the target BDS has been attached to a + * new AioContext; detach_aio_context() is called before the target BDS is being + * detached from its old AioContext. + */ +void bdrv_add_aio_context_notifier(BlockDriverState *bs, + void (*attached_aio_context)(AioContext *new_context, void *opaque), + void (*detach_aio_context)(void *opaque), void *opaque); + +/** + * bdrv_remove_aio_context_notifier: + * + * Unsubscribe of change notifications regarding the BDS's AioContext. The + * parameters given here have to be the same as those given to + * bdrv_add_aio_context_notifier(). + */ +void bdrv_remove_aio_context_notifier(BlockDriverState *bs, + void (*aio_context_attached)(AioContext *, + void *), + void (*aio_context_detached)(void *), + void *opaque); + +/** + * End all quiescent sections started by bdrv_drain_all_begin(). This is + * needed when deleting a BDS before bdrv_drain_all_end() is called. + * + * NOTE: this is an internal helper for bdrv_close() *only*. No one else + * should call it. + */ +void bdrv_drain_all_end_quiesce(BlockDriverState *bs); + +/** + * Make sure that the function is running under both drain and BQL. + * The latter protects from concurrent writings + * from the GS API, while the former prevents concurrent reads + * from I/O. + */ +static inline void assert_bdrv_graph_writable(BlockDriverState *bs) +{ + /* + * TODO: this function is incomplete. Because the users of this + * assert lack the necessary drains, check only for BQL. + * Once the necessary drains are added, + * assert also for qatomic_read(&bs->quiesce_counter) > 0 + */ + assert(qemu_in_main_thread()); +} + +#endif /* BLOCK_INT_GLOBAL_STATE */ diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h new file mode 100644 index 0000000..bb45420 --- /dev/null +++ b/include/block/block_int-io.h @@ -0,0 +1,194 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef BLOCK_INT_IO_H +#define BLOCK_INT_IO_H + +#include "block_int-common.h" + +/* + * I/O API functions. These functions are thread-safe. + * + * See include/block/block-io.h for more information about + * the I/O API. + */ + +int coroutine_fn bdrv_co_preadv_snapshot(BdrvChild *child, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset); +int coroutine_fn bdrv_co_snapshot_block_status(BlockDriverState *bs, + bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file); +int coroutine_fn bdrv_co_pdiscard_snapshot(BlockDriverState *bs, + int64_t offset, int64_t bytes); + + +int coroutine_fn bdrv_co_preadv(BdrvChild *child, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); +int coroutine_fn bdrv_co_pwritev(BdrvChild *child, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); + +static inline int coroutine_fn bdrv_co_pread(BdrvChild *child, + int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags) +{ + QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); + IO_CODE(); + + return bdrv_co_preadv(child, offset, bytes, &qiov, flags); +} + +static inline int coroutine_fn bdrv_co_pwrite(BdrvChild *child, + int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags) +{ + QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); + IO_CODE(); + + return bdrv_co_pwritev(child, offset, bytes, &qiov, flags); +} + +bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req, + uint64_t align); +BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs); + +BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size, + const char *filename); + +/** + * bdrv_wakeup: + * @bs: The BlockDriverState for which an I/O operation has been completed. + * + * Wake up the main thread if it is waiting on BDRV_POLL_WHILE. During + * synchronous I/O on a BlockDriverState that is attached to another + * I/O thread, the main thread lets the I/O thread's event loop run, + * waiting for the I/O operation to complete. A bdrv_wakeup will wake + * up the main thread if necessary. + * + * Manual calls to bdrv_wakeup are rarely necessary, because + * bdrv_dec_in_flight already calls it. + */ +void bdrv_wakeup(BlockDriverState *bs); + +const char *bdrv_get_parent_name(const BlockDriverState *bs); +bool blk_dev_has_tray(BlockBackend *blk); +bool blk_dev_is_tray_open(BlockBackend *blk); + +void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes); + +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out); +bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest, + const BdrvDirtyBitmap *src, + HBitmap **backup, bool lock); + +void bdrv_inc_in_flight(BlockDriverState *bs); +void bdrv_dec_in_flight(BlockDriverState *bs); + +int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); +int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + +int refresh_total_sectors(BlockDriverState *bs, int64_t hint); + +BdrvChild *bdrv_cow_child(BlockDriverState *bs); +BdrvChild *bdrv_filter_child(BlockDriverState *bs); +BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs); +BdrvChild *bdrv_primary_child(BlockDriverState *bs); +BlockDriverState *bdrv_skip_filters(BlockDriverState *bs); +BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs); + +static inline BlockDriverState *bdrv_cow_bs(BlockDriverState *bs) +{ + IO_CODE(); + return child_bs(bdrv_cow_child(bs)); +} + +static inline BlockDriverState *bdrv_filter_bs(BlockDriverState *bs) +{ + IO_CODE(); + return child_bs(bdrv_filter_child(bs)); +} + +static inline BlockDriverState *bdrv_filter_or_cow_bs(BlockDriverState *bs) +{ + IO_CODE(); + return child_bs(bdrv_filter_or_cow_child(bs)); +} + +static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs) +{ + IO_CODE(); + return child_bs(bdrv_primary_child(bs)); +} + +/** + * Check whether the given offset is in the cached block-status data + * region. + * + * If it is, and @pnum is not NULL, *pnum is set to + * `bsc.data_end - offset`, i.e. how many bytes, starting from + * @offset, are data (according to the cache). + * Otherwise, *pnum is not touched. + */ +bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum); + +/** + * If [offset, offset + bytes) overlaps with the currently cached + * block-status region, invalidate the cache. + * + * (To be used by I/O paths that cause data regions to be zero or + * holes.) + */ +void bdrv_bsc_invalidate_range(BlockDriverState *bs, + int64_t offset, int64_t bytes); + +/** + * Mark the range [offset, offset + bytes) as a data region. + */ +void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes); + + +/* + * "I/O or GS" API functions. These functions can run without + * the BQL, but only in one specific iothread/main loop. + * + * See include/block/block-io.h for more information about + * the "I/O or GS" API. + */ + +void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent); +void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent); + +#endif /* BLOCK_INT_IO_H */ diff --git a/include/block/block_int.h b/include/block/block_int.h index 27008cf..7d50b6b 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -24,1478 +24,9 @@ #ifndef BLOCK_INT_H #define BLOCK_INT_H -#include "block/accounting.h" -#include "block/block.h" -#include "block/aio-wait.h" -#include "qemu/queue.h" -#include "qemu/coroutine.h" -#include "qemu/stats64.h" -#include "qemu/timer.h" -#include "qemu/hbitmap.h" -#include "block/snapshot.h" -#include "qemu/throttle.h" -#include "qemu/rcu.h" +#include "block_int-global-state.h" +#include "block_int-io.h" -#define BLOCK_FLAG_LAZY_REFCOUNTS 8 - -#define BLOCK_OPT_SIZE "size" -#define BLOCK_OPT_ENCRYPT "encryption" -#define BLOCK_OPT_ENCRYPT_FORMAT "encrypt.format" -#define BLOCK_OPT_COMPAT6 "compat6" -#define BLOCK_OPT_HWVERSION "hwversion" -#define BLOCK_OPT_BACKING_FILE "backing_file" -#define BLOCK_OPT_BACKING_FMT "backing_fmt" -#define BLOCK_OPT_CLUSTER_SIZE "cluster_size" -#define BLOCK_OPT_TABLE_SIZE "table_size" -#define BLOCK_OPT_PREALLOC "preallocation" -#define BLOCK_OPT_SUBFMT "subformat" -#define BLOCK_OPT_COMPAT_LEVEL "compat" -#define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts" -#define BLOCK_OPT_ADAPTER_TYPE "adapter_type" -#define BLOCK_OPT_REDUNDANCY "redundancy" -#define BLOCK_OPT_NOCOW "nocow" -#define BLOCK_OPT_EXTENT_SIZE_HINT "extent_size_hint" -#define BLOCK_OPT_OBJECT_SIZE "object_size" -#define BLOCK_OPT_REFCOUNT_BITS "refcount_bits" -#define BLOCK_OPT_DATA_FILE "data_file" -#define BLOCK_OPT_DATA_FILE_RAW "data_file_raw" -#define BLOCK_OPT_COMPRESSION_TYPE "compression_type" -#define BLOCK_OPT_EXTL2 "extended_l2" - -#define BLOCK_PROBE_BUF_SIZE 512 - -enum BdrvTrackedRequestType { - BDRV_TRACKED_READ, - BDRV_TRACKED_WRITE, - BDRV_TRACKED_DISCARD, - BDRV_TRACKED_TRUNCATE, -}; - -/* - * That is not quite good that BdrvTrackedRequest structure is public, - * as block/io.c is very careful about incoming offset/bytes being - * correct. Be sure to assert bdrv_check_request() succeeded after any - * modification of BdrvTrackedRequest object out of block/io.c - */ -typedef struct BdrvTrackedRequest { - BlockDriverState *bs; - int64_t offset; - int64_t bytes; - enum BdrvTrackedRequestType type; - - bool serialising; - int64_t overlap_offset; - int64_t overlap_bytes; - - QLIST_ENTRY(BdrvTrackedRequest) list; - Coroutine *co; /* owner, used for deadlock detection */ - CoQueue wait_queue; /* coroutines blocked on this request */ - - struct BdrvTrackedRequest *waiting_for; -} BdrvTrackedRequest; - -int bdrv_check_qiov_request(int64_t offset, int64_t bytes, - QEMUIOVector *qiov, size_t qiov_offset, - Error **errp); -int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp); - -struct BlockDriver { - const char *format_name; - int instance_size; - - /* set to true if the BlockDriver is a block filter. Block filters pass - * certain callbacks that refer to data (see block.c) to their bs->file - * or bs->backing (whichever one exists) if the driver doesn't implement - * them. Drivers that do not wish to forward must implement them and return - * -ENOTSUP. - * Note that filters are not allowed to modify data. - * - * Filters generally cannot have more than a single filtered child, - * because the data they present must at all times be the same as - * that on their filtered child. That would be impossible to - * achieve for multiple filtered children. - * (And this filtered child must then be bs->file or bs->backing.) - */ - bool is_filter; - /* - * Set to true if the BlockDriver is a format driver. Format nodes - * generally do not expect their children to be other format nodes - * (except for backing files), and so format probing is disabled - * on those children. - */ - bool is_format; - /* - * Return true if @to_replace can be replaced by a BDS with the - * same data as @bs without it affecting @bs's behavior (that is, - * without it being visible to @bs's parents). - */ - bool (*bdrv_recurse_can_replace)(BlockDriverState *bs, - BlockDriverState *to_replace); - - int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename); - int (*bdrv_probe_device)(const char *filename); - - /* Any driver implementing this callback is expected to be able to handle - * NULL file names in its .bdrv_open() implementation */ - void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp); - /* Drivers not implementing bdrv_parse_filename nor bdrv_open should have - * this field set to true, except ones that are defined only by their - * child's bs. - * An example of the last type will be the quorum block driver. - */ - bool bdrv_needs_filename; - - /* - * Set if a driver can support backing files. This also implies the - * following semantics: - * - * - Return status 0 of .bdrv_co_block_status means that corresponding - * blocks are not allocated in this layer of backing-chain - * - For such (unallocated) blocks, read will: - * - fill buffer with zeros if there is no backing file - * - read from the backing file otherwise, where the block layer - * takes care of reading zeros beyond EOF if backing file is short - */ - bool supports_backing; - - /* For handling image reopen for split or non-split files */ - int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state, - BlockReopenQueue *queue, Error **errp); - void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state); - void (*bdrv_reopen_commit_post)(BDRVReopenState *reopen_state); - void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state); - void (*bdrv_join_options)(QDict *options, QDict *old_options); - - int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags, - Error **errp); - - /* Protocol drivers should implement this instead of bdrv_open */ - int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags, - Error **errp); - void (*bdrv_close)(BlockDriverState *bs); - - - int coroutine_fn (*bdrv_co_create)(BlockdevCreateOptions *opts, - Error **errp); - int coroutine_fn (*bdrv_co_create_opts)(BlockDriver *drv, - const char *filename, - QemuOpts *opts, - Error **errp); - - int coroutine_fn (*bdrv_co_amend)(BlockDriverState *bs, - BlockdevAmendOptions *opts, - bool force, - Error **errp); - - int (*bdrv_amend_options)(BlockDriverState *bs, - QemuOpts *opts, - BlockDriverAmendStatusCB *status_cb, - void *cb_opaque, - bool force, - Error **errp); - - int (*bdrv_make_empty)(BlockDriverState *bs); - - /* - * Refreshes the bs->exact_filename field. If that is impossible, - * bs->exact_filename has to be left empty. - */ - void (*bdrv_refresh_filename)(BlockDriverState *bs); - - /* - * Gathers the open options for all children into @target. - * A simple format driver (without backing file support) might - * implement this function like this: - * - * QINCREF(bs->file->bs->full_open_options); - * qdict_put(target, "file", bs->file->bs->full_open_options); - * - * If not specified, the generic implementation will simply put - * all children's options under their respective name. - * - * @backing_overridden is true when bs->backing seems not to be - * the child that would result from opening bs->backing_file. - * Therefore, if it is true, the backing child's options should be - * gathered; otherwise, there is no need since the backing child - * is the one implied by the image header. - * - * Note that ideally this function would not be needed. Every - * block driver which implements it is probably doing something - * shady regarding its runtime option structure. - */ - void (*bdrv_gather_child_options)(BlockDriverState *bs, QDict *target, - bool backing_overridden); - - /* - * Returns an allocated string which is the directory name of this BDS: It - * will be used to make relative filenames absolute by prepending this - * function's return value to them. - */ - char *(*bdrv_dirname)(BlockDriverState *bs, Error **errp); - - /* aio */ - BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs, - int64_t offset, int64_t bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque); - BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs, - int64_t offset, int64_t bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque); - BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque); - BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs, - int64_t offset, int bytes, - BlockCompletionFunc *cb, void *opaque); - - int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); - - /** - * @offset: position in bytes to read at - * @bytes: number of bytes to read - * @qiov: the buffers to fill with read data - * @flags: currently unused, always 0 - * - * @offset and @bytes will be a multiple of 'request_alignment', - * but the length of individual @qiov elements does not have to - * be a multiple. - * - * @bytes will always equal the total size of @qiov, and will be - * no larger than 'max_transfer'. - * - * The buffer in @qiov may point directly to guest memory. - */ - int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs, - int64_t offset, int64_t bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); - int coroutine_fn (*bdrv_co_preadv_part)(BlockDriverState *bs, - int64_t offset, int64_t bytes, - QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); - int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags); - /** - * @offset: position in bytes to write at - * @bytes: number of bytes to write - * @qiov: the buffers containing data to write - * @flags: zero or more bits allowed by 'supported_write_flags' - * - * @offset and @bytes will be a multiple of 'request_alignment', - * but the length of individual @qiov elements does not have to - * be a multiple. - * - * @bytes will always equal the total size of @qiov, and will be - * no larger than 'max_transfer'. - * - * The buffer in @qiov may point directly to guest memory. - */ - int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs, - int64_t offset, int64_t bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); - int coroutine_fn (*bdrv_co_pwritev_part)(BlockDriverState *bs, - int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, - BdrvRequestFlags flags); - - /* - * Efficiently zero a region of the disk image. Typically an image format - * would use a compact metadata representation to implement this. This - * function pointer may be NULL or return -ENOSUP and .bdrv_co_writev() - * will be called instead. - */ - int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, - int64_t offset, int64_t bytes, BdrvRequestFlags flags); - int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, - int64_t offset, int64_t bytes); - - /* Map [offset, offset + nbytes) range onto a child of @bs to copy from, - * and invoke bdrv_co_copy_range_from(child, ...), or invoke - * bdrv_co_copy_range_to() if @bs is the leaf child to copy data from. - * - * See the comment of bdrv_co_copy_range for the parameter and return value - * semantics. - */ - int coroutine_fn (*bdrv_co_copy_range_from)(BlockDriverState *bs, - BdrvChild *src, - int64_t offset, - BdrvChild *dst, - int64_t dst_offset, - int64_t bytes, - BdrvRequestFlags read_flags, - BdrvRequestFlags write_flags); - - /* Map [offset, offset + nbytes) range onto a child of bs to copy data to, - * and invoke bdrv_co_copy_range_to(child, src, ...), or perform the copy - * operation if @bs is the leaf and @src has the same BlockDriver. Return - * -ENOTSUP if @bs is the leaf but @src has a different BlockDriver. - * - * See the comment of bdrv_co_copy_range for the parameter and return value - * semantics. - */ - int coroutine_fn (*bdrv_co_copy_range_to)(BlockDriverState *bs, - BdrvChild *src, - int64_t src_offset, - BdrvChild *dst, - int64_t dst_offset, - int64_t bytes, - BdrvRequestFlags read_flags, - BdrvRequestFlags write_flags); - - /* - * Building block for bdrv_block_status[_above] and - * bdrv_is_allocated[_above]. The driver should answer only - * according to the current layer, and should only need to set - * BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_OFFSET_VALID, - * and/or BDRV_BLOCK_RAW; if the current layer defers to a backing - * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). See - * block.h for the overall meaning of the bits. As a hint, the - * flag want_zero is true if the caller cares more about precise - * mappings (favor accurate _OFFSET_VALID/_ZERO) or false for - * overall allocation (favor larger *pnum, perhaps by reporting - * _DATA instead of _ZERO). The block layer guarantees input - * clamped to bdrv_getlength() and aligned to request_alignment, - * as well as non-NULL pnum, map, and file; in turn, the driver - * must return an error or set pnum to an aligned non-zero value. - * - * Note that @bytes is just a hint on how big of a region the - * caller wants to inspect. It is not a limit on *pnum. - * Implementations are free to return larger values of *pnum if - * doing so does not incur a performance penalty. - * - * block/io.c's bdrv_co_block_status() will utilize an unclamped - * *pnum value for the block-status cache on protocol nodes, prior - * to clamping *pnum for return to its caller. - */ - int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs, - bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, - int64_t *map, BlockDriverState **file); - - /* - * This informs the driver that we are no longer interested in the result - * of in-flight requests, so don't waste the time if possible. - * - * One example usage is to avoid waiting for an nbd target node reconnect - * timeout during job-cancel with force=true. - */ - void (*bdrv_cancel_in_flight)(BlockDriverState *bs); - - /* - * Invalidate any cached meta-data. - */ - void coroutine_fn (*bdrv_co_invalidate_cache)(BlockDriverState *bs, - Error **errp); - int (*bdrv_inactivate)(BlockDriverState *bs); - - /* - * Flushes all data for all layers by calling bdrv_co_flush for underlying - * layers, if needed. This function is needed for deterministic - * synchronization of the flush finishing callback. - */ - int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs); - - /* Delete a created file. */ - int coroutine_fn (*bdrv_co_delete_file)(BlockDriverState *bs, - Error **errp); - - /* - * Flushes all data that was already written to the OS all the way down to - * the disk (for example file-posix.c calls fsync()). - */ - int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs); - - /* - * Flushes all internal caches to the OS. The data may still sit in a - * writeback cache of the host OS, but it will survive a crash of the qemu - * process. - */ - int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs); - - /* - * Drivers setting this field must be able to work with just a plain - * filename with '<protocol_name>:' as a prefix, and no other options. - * Options may be extracted from the filename by implementing - * bdrv_parse_filename. - */ - const char *protocol_name; - - /* - * Truncate @bs to @offset bytes using the given @prealloc mode - * when growing. Modes other than PREALLOC_MODE_OFF should be - * rejected when shrinking @bs. - * - * If @exact is true, @bs must be resized to exactly @offset. - * Otherwise, it is sufficient for @bs (if it is a host block - * device and thus there is no way to resize it) to be at least - * @offset bytes in length. - * - * If @exact is true and this function fails but would succeed - * with @exact = false, it should return -ENOTSUP. - */ - int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset, - bool exact, PreallocMode prealloc, - BdrvRequestFlags flags, Error **errp); - - int64_t (*bdrv_getlength)(BlockDriverState *bs); - bool has_variable_length; - int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs); - BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs, - Error **errp); - - int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs, - int64_t offset, int64_t bytes, QEMUIOVector *qiov); - int coroutine_fn (*bdrv_co_pwritev_compressed_part)(BlockDriverState *bs, - int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset); - - int (*bdrv_snapshot_create)(BlockDriverState *bs, - QEMUSnapshotInfo *sn_info); - int (*bdrv_snapshot_goto)(BlockDriverState *bs, - const char *snapshot_id); - int (*bdrv_snapshot_delete)(BlockDriverState *bs, - const char *snapshot_id, - const char *name, - Error **errp); - int (*bdrv_snapshot_list)(BlockDriverState *bs, - QEMUSnapshotInfo **psn_info); - int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs, - const char *snapshot_id, - const char *name, - Error **errp); - int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi); - ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs, - Error **errp); - BlockStatsSpecific *(*bdrv_get_specific_stats)(BlockDriverState *bs); - - int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs, - QEMUIOVector *qiov, - int64_t pos); - int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs, - QEMUIOVector *qiov, - int64_t pos); - - int (*bdrv_change_backing_file)(BlockDriverState *bs, - const char *backing_file, const char *backing_fmt); - - /* removable device specific */ - bool (*bdrv_is_inserted)(BlockDriverState *bs); - void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); - void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked); - - /* to control generic scsi devices */ - BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque); - int coroutine_fn (*bdrv_co_ioctl)(BlockDriverState *bs, - unsigned long int req, void *buf); - - /* List of options for creating images, terminated by name == NULL */ - QemuOptsList *create_opts; - - /* List of options for image amend */ - QemuOptsList *amend_opts; - - /* - * If this driver supports reopening images this contains a - * NULL-terminated list of the runtime options that can be - * modified. If an option in this list is unspecified during - * reopen then it _must_ be reset to its default value or return - * an error. - */ - const char *const *mutable_opts; - - /* - * Returns 0 for completed check, -errno for internal errors. - * The check results are stored in result. - */ - int coroutine_fn (*bdrv_co_check)(BlockDriverState *bs, - BdrvCheckResult *result, - BdrvCheckMode fix); - - void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event); - - /* TODO Better pass a option string/QDict/QemuOpts to add any rule? */ - int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event, - const char *tag); - int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs, - const char *tag); - int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag); - bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag); - - void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp); - - /* - * Returns 1 if newly created images are guaranteed to contain only - * zeros, 0 otherwise. - */ - int (*bdrv_has_zero_init)(BlockDriverState *bs); - - /* Remove fd handlers, timers, and other event loop callbacks so the event - * loop is no longer in use. Called with no in-flight requests and in - * depth-first traversal order with parents before child nodes. - */ - void (*bdrv_detach_aio_context)(BlockDriverState *bs); - - /* Add fd handlers, timers, and other event loop callbacks so I/O requests - * can be processed again. Called with no in-flight requests and in - * depth-first traversal order with child nodes before parent nodes. - */ - void (*bdrv_attach_aio_context)(BlockDriverState *bs, - AioContext *new_context); - - /* io queue for linux-aio */ - void (*bdrv_io_plug)(BlockDriverState *bs); - void (*bdrv_io_unplug)(BlockDriverState *bs); - - /** - * Try to get @bs's logical and physical block size. - * On success, store them in @bsz and return zero. - * On failure, return negative errno. - */ - int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz); - /** - * Try to get @bs's geometry (cyls, heads, sectors) - * On success, store them in @geo and return 0. - * On failure return -errno. - * Only drivers that want to override guest geometry implement this - * callback; see hd_geometry_guess(). - */ - int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo); - - /** - * bdrv_co_drain_begin is called if implemented in the beginning of a - * drain operation to drain and stop any internal sources of requests in - * the driver. - * bdrv_co_drain_end is called if implemented at the end of the drain. - * - * They should be used by the driver to e.g. manage scheduled I/O - * requests, or toggle an internal state. After the end of the drain new - * requests will continue normally. - */ - void coroutine_fn (*bdrv_co_drain_begin)(BlockDriverState *bs); - void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs); - - void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child, - Error **errp); - void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child, - Error **errp); - - /** - * Informs the block driver that a permission change is intended. The - * driver checks whether the change is permissible and may take other - * preparations for the change (e.g. get file system locks). This operation - * is always followed either by a call to either .bdrv_set_perm or - * .bdrv_abort_perm_update. - * - * Checks whether the requested set of cumulative permissions in @perm - * can be granted for accessing @bs and whether no other users are using - * permissions other than those given in @shared (both arguments take - * BLK_PERM_* bitmasks). - * - * If both conditions are met, 0 is returned. Otherwise, -errno is returned - * and errp is set to an error describing the conflict. - */ - int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm, - uint64_t shared, Error **errp); - - /** - * Called to inform the driver that the set of cumulative set of used - * permissions for @bs has changed to @perm, and the set of sharable - * permission to @shared. The driver can use this to propagate changes to - * its children (i.e. request permissions only if a parent actually needs - * them). - * - * This function is only invoked after bdrv_check_perm(), so block drivers - * may rely on preparations made in their .bdrv_check_perm implementation. - */ - void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t shared); - - /* - * Called to inform the driver that after a previous bdrv_check_perm() - * call, the permission update is not performed and any preparations made - * for it (e.g. taken file locks) need to be undone. - * - * This function can be called even for nodes that never saw a - * bdrv_check_perm() call. It is a no-op then. - */ - void (*bdrv_abort_perm_update)(BlockDriverState *bs); - - /** - * Returns in @nperm and @nshared the permissions that the driver for @bs - * needs on its child @c, based on the cumulative permissions requested by - * the parents in @parent_perm and @parent_shared. - * - * If @c is NULL, return the permissions for attaching a new child for the - * given @child_class and @role. - * - * If @reopen_queue is non-NULL, don't return the currently needed - * permissions, but those that will be needed after applying the - * @reopen_queue. - */ - void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c, - BdrvChildRole role, - BlockReopenQueue *reopen_queue, - uint64_t parent_perm, uint64_t parent_shared, - uint64_t *nperm, uint64_t *nshared); - - bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs); - bool (*bdrv_co_can_store_new_dirty_bitmap)(BlockDriverState *bs, - const char *name, - uint32_t granularity, - Error **errp); - int (*bdrv_co_remove_persistent_dirty_bitmap)(BlockDriverState *bs, - const char *name, - Error **errp); - - /** - * Register/unregister a buffer for I/O. For example, when the driver is - * interested to know the memory areas that will later be used in iovs, so - * that it can do IOMMU mapping with VFIO etc., in order to get better - * performance. In the case of VFIO drivers, this callback is used to do - * DMA mapping for hot buffers. - */ - void (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size); - void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host); - QLIST_ENTRY(BlockDriver) list; - - /* Pointer to a NULL-terminated array of names of strong options - * that can be specified for bdrv_open(). A strong option is one - * that changes the data of a BDS. - * If this pointer is NULL, the array is considered empty. - * "filename" and "driver" are always considered strong. */ - const char *const *strong_runtime_opts; -}; - -static inline bool block_driver_can_compress(BlockDriver *drv) -{ - return drv->bdrv_co_pwritev_compressed || - drv->bdrv_co_pwritev_compressed_part; -} - -typedef struct BlockLimits { - /* Alignment requirement, in bytes, for offset/length of I/O - * requests. Must be a power of 2 less than INT_MAX; defaults to - * 1 for drivers with modern byte interfaces, and to 512 - * otherwise. */ - uint32_t request_alignment; - - /* - * Maximum number of bytes that can be discarded at once. Must be multiple - * of pdiscard_alignment, but need not be power of 2. May be 0 if no - * inherent 64-bit limit. - */ - int64_t max_pdiscard; - - /* Optimal alignment for discard requests in bytes. A power of 2 - * is best but not mandatory. Must be a multiple of - * bl.request_alignment, and must be less than max_pdiscard if - * that is set. May be 0 if bl.request_alignment is good enough */ - uint32_t pdiscard_alignment; - - /* - * Maximum number of bytes that can zeroized at once. Must be multiple of - * pwrite_zeroes_alignment. 0 means no limit. - */ - int64_t max_pwrite_zeroes; - - /* Optimal alignment for write zeroes requests in bytes. A power - * of 2 is best but not mandatory. Must be a multiple of - * bl.request_alignment, and must be less than max_pwrite_zeroes - * if that is set. May be 0 if bl.request_alignment is good - * enough */ - uint32_t pwrite_zeroes_alignment; - - /* Optimal transfer length in bytes. A power of 2 is best but not - * mandatory. Must be a multiple of bl.request_alignment, or 0 if - * no preferred size */ - uint32_t opt_transfer; - - /* Maximal transfer length in bytes. Need not be power of 2, but - * must be multiple of opt_transfer and bl.request_alignment, or 0 - * for no 32-bit limit. For now, anything larger than INT_MAX is - * clamped down. */ - uint32_t max_transfer; - - /* Maximal hardware transfer length in bytes. Applies whenever - * transfers to the device bypass the kernel I/O scheduler, for - * example with SG_IO. If larger than max_transfer or if zero, - * blk_get_max_hw_transfer will fall back to max_transfer. - */ - uint64_t max_hw_transfer; - - /* Maximal number of scatter/gather elements allowed by the hardware. - * Applies whenever transfers to the device bypass the kernel I/O - * scheduler, for example with SG_IO. If larger than max_iov - * or if zero, blk_get_max_hw_iov will fall back to max_iov. - */ - int max_hw_iov; - - /* memory alignment, in bytes so that no bounce buffer is needed */ - size_t min_mem_alignment; - - /* memory alignment, in bytes, for bounce buffer */ - size_t opt_mem_alignment; - - /* maximum number of iovec elements */ - int max_iov; -} BlockLimits; - -typedef struct BdrvOpBlocker BdrvOpBlocker; - -typedef struct BdrvAioNotifier { - void (*attached_aio_context)(AioContext *new_context, void *opaque); - void (*detach_aio_context)(void *opaque); - - void *opaque; - bool deleted; - - QLIST_ENTRY(BdrvAioNotifier) list; -} BdrvAioNotifier; - -struct BdrvChildClass { - /* If true, bdrv_replace_node() doesn't change the node this BdrvChild - * points to. */ - bool stay_at_node; - - /* If true, the parent is a BlockDriverState and bdrv_next_all_states() - * will return it. This information is used for drain_all, where every node - * will be drained separately, so the drain only needs to be propagated to - * non-BDS parents. */ - bool parent_is_bds; - - void (*inherit_options)(BdrvChildRole role, bool parent_is_format, - int *child_flags, QDict *child_options, - int parent_flags, QDict *parent_options); - - void (*change_media)(BdrvChild *child, bool load); - void (*resize)(BdrvChild *child); - - /* Returns a name that is supposedly more useful for human users than the - * node name for identifying the node in question (in particular, a BB - * name), or NULL if the parent can't provide a better name. */ - const char *(*get_name)(BdrvChild *child); - - /* Returns a malloced string that describes the parent of the child for a - * human reader. This could be a node-name, BlockBackend name, qdev ID or - * QOM path of the device owning the BlockBackend, job type and ID etc. The - * caller is responsible for freeing the memory. */ - char *(*get_parent_desc)(BdrvChild *child); - - /* - * If this pair of functions is implemented, the parent doesn't issue new - * requests after returning from .drained_begin() until .drained_end() is - * called. - * - * These functions must not change the graph (and therefore also must not - * call aio_poll(), which could change the graph indirectly). - * - * If drained_end() schedules background operations, it must atomically - * increment *drained_end_counter for each such operation and atomically - * decrement it once the operation has settled. - * - * Note that this can be nested. If drained_begin() was called twice, new - * I/O is allowed only after drained_end() was called twice, too. - */ - void (*drained_begin)(BdrvChild *child); - void (*drained_end)(BdrvChild *child, int *drained_end_counter); - - /* - * Returns whether the parent has pending requests for the child. This - * callback is polled after .drained_begin() has been called until all - * activity on the child has stopped. - */ - bool (*drained_poll)(BdrvChild *child); - - /* Notifies the parent that the child has been activated/inactivated (e.g. - * when migration is completing) and it can start/stop requesting - * permissions and doing I/O on it. */ - void (*activate)(BdrvChild *child, Error **errp); - int (*inactivate)(BdrvChild *child); - - void (*attach)(BdrvChild *child); - void (*detach)(BdrvChild *child); - - /* Notifies the parent that the filename of its child has changed (e.g. - * because the direct child was removed from the backing chain), so that it - * can update its reference. */ - int (*update_filename)(BdrvChild *child, BlockDriverState *new_base, - const char *filename, Error **errp); - - bool (*can_set_aio_ctx)(BdrvChild *child, AioContext *ctx, - GSList **ignore, Error **errp); - void (*set_aio_ctx)(BdrvChild *child, AioContext *ctx, GSList **ignore); - - AioContext *(*get_parent_aio_context)(BdrvChild *child); -}; - -extern const BdrvChildClass child_of_bds; - -struct BdrvChild { - BlockDriverState *bs; - char *name; - const BdrvChildClass *klass; - BdrvChildRole role; - void *opaque; - - /** - * Granted permissions for operating on this BdrvChild (BLK_PERM_* bitmask) - */ - uint64_t perm; - - /** - * Permissions that can still be granted to other users of @bs while this - * BdrvChild is still attached to it. (BLK_PERM_* bitmask) - */ - uint64_t shared_perm; - - /* - * This link is frozen: the child can neither be replaced nor - * detached from the parent. - */ - bool frozen; - - /* - * How many times the parent of this child has been drained - * (through klass->drained_*). - * Usually, this is equal to bs->quiesce_counter (potentially - * reduced by bdrv_drain_all_count). It may differ while the - * child is entering or leaving a drained section. - */ - int parent_quiesce_counter; - - QLIST_ENTRY(BdrvChild) next; - QLIST_ENTRY(BdrvChild) next_parent; -}; - -/* - * Allows bdrv_co_block_status() to cache one data region for a - * protocol node. - * - * @valid: Whether the cache is valid (should be accessed with atomic - * functions so this can be reset by RCU readers) - * @data_start: Offset where we know (or strongly assume) is data - * @data_end: Offset where the data region ends (which is not necessarily - * the start of a zeroed region) - */ -typedef struct BdrvBlockStatusCache { - struct rcu_head rcu; - - bool valid; - int64_t data_start; - int64_t data_end; -} BdrvBlockStatusCache; - -struct BlockDriverState { - /* Protected by big QEMU lock or read-only after opening. No special - * locking needed during I/O... - */ - int open_flags; /* flags used to open the file, re-used for re-open */ - bool encrypted; /* if true, the media is encrypted */ - bool sg; /* if true, the device is a /dev/sg* */ - bool probed; /* if true, format was probed rather than specified */ - bool force_share; /* if true, always allow all shared permissions */ - bool implicit; /* if true, this filter node was automatically inserted */ - - BlockDriver *drv; /* NULL means no media */ - void *opaque; - - AioContext *aio_context; /* event loop used for fd handlers, timers, etc */ - /* long-running tasks intended to always use the same AioContext as this - * BDS may register themselves in this list to be notified of changes - * regarding this BDS's context */ - QLIST_HEAD(, BdrvAioNotifier) aio_notifiers; - bool walking_aio_notifiers; /* to make removal during iteration safe */ - - char filename[PATH_MAX]; - /* - * If not empty, this image is a diff in relation to backing_file. - * Note that this is the name given in the image header and - * therefore may or may not be equal to .backing->bs->filename. - * If this field contains a relative path, it is to be resolved - * relatively to the overlay's location. - */ - char backing_file[PATH_MAX]; - /* - * The backing filename indicated by the image header. Contrary - * to backing_file, if we ever open this file, auto_backing_file - * is replaced by the resulting BDS's filename (i.e. after a - * bdrv_refresh_filename() run). - */ - char auto_backing_file[PATH_MAX]; - char backing_format[16]; /* if non-zero and backing_file exists */ - - QDict *full_open_options; - char exact_filename[PATH_MAX]; - - BdrvChild *backing; - BdrvChild *file; - - /* I/O Limits */ - BlockLimits bl; - - /* - * Flags honored during pread - */ - unsigned int supported_read_flags; - /* Flags honored during pwrite (so far: BDRV_REQ_FUA, - * BDRV_REQ_WRITE_UNCHANGED). - * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those - * writes will be issued as normal writes without the flag set. - * This is important to note for drivers that do not explicitly - * request a WRITE permission for their children and instead take - * the same permissions as their parent did (this is commonly what - * block filters do). Such drivers have to be aware that the - * parent may have taken a WRITE_UNCHANGED permission only and is - * issuing such requests. Drivers either must make sure that - * these requests do not result in plain WRITE accesses (usually - * by supporting BDRV_REQ_WRITE_UNCHANGED, and then forwarding - * every incoming write request as-is, including potentially that - * flag), or they have to explicitly take the WRITE permission for - * their children. */ - unsigned int supported_write_flags; - /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA, - * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */ - unsigned int supported_zero_flags; - /* - * Flags honoured during truncate (so far: BDRV_REQ_ZERO_WRITE). - * - * If BDRV_REQ_ZERO_WRITE is given, the truncate operation must make sure - * that any added space reads as all zeros. If this can't be guaranteed, - * the operation must fail. - */ - unsigned int supported_truncate_flags; - - /* the following member gives a name to every node on the bs graph. */ - char node_name[32]; - /* element of the list of named nodes building the graph */ - QTAILQ_ENTRY(BlockDriverState) node_list; - /* element of the list of all BlockDriverStates (all_bdrv_states) */ - QTAILQ_ENTRY(BlockDriverState) bs_list; - /* element of the list of monitor-owned BDS */ - QTAILQ_ENTRY(BlockDriverState) monitor_list; - int refcnt; - - /* operation blockers */ - QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX]; - - /* The node that this node inherited default options from (and a reopen on - * which can affect this node by changing these defaults). This is always a - * parent node of this node. */ - BlockDriverState *inherits_from; - QLIST_HEAD(, BdrvChild) children; - QLIST_HEAD(, BdrvChild) parents; - - QDict *options; - QDict *explicit_options; - BlockdevDetectZeroesOptions detect_zeroes; - - /* The error object in use for blocking operations on backing_hd */ - Error *backing_blocker; - - /* Protected by AioContext lock */ - - /* If we are reading a disk image, give its size in sectors. - * Generally read-only; it is written to by load_snapshot and - * save_snaphost, but the block layer is quiescent during those. - */ - int64_t total_sectors; - - /* threshold limit for writes, in bytes. "High water mark". */ - uint64_t write_threshold_offset; - - /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex. - * Reading from the list can be done with either the BQL or the - * dirty_bitmap_mutex. Modifying a bitmap only requires - * dirty_bitmap_mutex. */ - QemuMutex dirty_bitmap_mutex; - QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; - - /* Offset after the highest byte written to */ - Stat64 wr_highest_offset; - - /* If true, copy read backing sectors into image. Can be >1 if more - * than one client has requested copy-on-read. Accessed with atomic - * ops. - */ - int copy_on_read; - - /* number of in-flight requests; overall and serialising. - * Accessed with atomic ops. - */ - unsigned int in_flight; - unsigned int serialising_in_flight; - - /* counter for nested bdrv_io_plug. - * Accessed with atomic ops. - */ - unsigned io_plugged; - - /* do we need to tell the quest if we have a volatile write cache? */ - int enable_write_cache; - - /* Accessed with atomic ops. */ - int quiesce_counter; - int recursive_quiesce_counter; - - unsigned int write_gen; /* Current data generation */ - - /* Protected by reqs_lock. */ - CoMutex reqs_lock; - QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; - CoQueue flush_queue; /* Serializing flush queue */ - bool active_flush_req; /* Flush request in flight? */ - - /* Only read/written by whoever has set active_flush_req to true. */ - unsigned int flushed_gen; /* Flushed write generation */ - - /* BdrvChild links to this node may never be frozen */ - bool never_freeze; - - /* Lock for block-status cache RCU writers */ - CoMutex bsc_modify_lock; - /* Always non-NULL, but must only be dereferenced under an RCU read guard */ - BdrvBlockStatusCache *block_status_cache; -}; - -struct BlockBackendRootState { - int open_flags; - BlockdevDetectZeroesOptions detect_zeroes; -}; - -typedef enum BlockMirrorBackingMode { - /* Reuse the existing backing chain from the source for the target. - * - sync=full: Set backing BDS to NULL. - * - sync=top: Use source's backing BDS. - * - sync=none: Use source as the backing BDS. */ - MIRROR_SOURCE_BACKING_CHAIN, - - /* Open the target's backing chain completely anew */ - MIRROR_OPEN_BACKING_CHAIN, - - /* Do not change the target's backing BDS after job completion */ - MIRROR_LEAVE_BACKING_CHAIN, -} BlockMirrorBackingMode; - - -/* Essential block drivers which must always be statically linked into qemu, and - * which therefore can be accessed without using bdrv_find_format() */ -extern BlockDriver bdrv_file; -extern BlockDriver bdrv_raw; -extern BlockDriver bdrv_qcow2; - -int coroutine_fn bdrv_co_preadv(BdrvChild *child, - int64_t offset, int64_t bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, - int64_t offset, int64_t bytes, - QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); -int coroutine_fn bdrv_co_pwritev(BdrvChild *child, - int64_t offset, int64_t bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, - int64_t offset, int64_t bytes, - QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); - -static inline int coroutine_fn bdrv_co_pread(BdrvChild *child, - int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags) -{ - QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); - - return bdrv_co_preadv(child, offset, bytes, &qiov, flags); -} - -static inline int coroutine_fn bdrv_co_pwrite(BdrvChild *child, - int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags) -{ - QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); - - return bdrv_co_pwritev(child, offset, bytes, &qiov, flags); -} - -extern unsigned int bdrv_drain_all_count; -void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent); -void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent); - -bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req, - uint64_t align); -BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs); - -int get_tmp_filename(char *filename, int size); -BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size, - const char *filename); - -void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix, - QDict *options); - -/** - * bdrv_add_aio_context_notifier: - * - * If a long-running job intends to be always run in the same AioContext as a - * certain BDS, it may use this function to be notified of changes regarding the - * association of the BDS to an AioContext. - * - * attached_aio_context() is called after the target BDS has been attached to a - * new AioContext; detach_aio_context() is called before the target BDS is being - * detached from its old AioContext. - */ -void bdrv_add_aio_context_notifier(BlockDriverState *bs, - void (*attached_aio_context)(AioContext *new_context, void *opaque), - void (*detach_aio_context)(void *opaque), void *opaque); - -/** - * bdrv_remove_aio_context_notifier: - * - * Unsubscribe of change notifications regarding the BDS's AioContext. The - * parameters given here have to be the same as those given to - * bdrv_add_aio_context_notifier(). - */ -void bdrv_remove_aio_context_notifier(BlockDriverState *bs, - void (*aio_context_attached)(AioContext *, - void *), - void (*aio_context_detached)(void *), - void *opaque); - -/** - * bdrv_wakeup: - * @bs: The BlockDriverState for which an I/O operation has been completed. - * - * Wake up the main thread if it is waiting on BDRV_POLL_WHILE. During - * synchronous I/O on a BlockDriverState that is attached to another - * I/O thread, the main thread lets the I/O thread's event loop run, - * waiting for the I/O operation to complete. A bdrv_wakeup will wake - * up the main thread if necessary. - * - * Manual calls to bdrv_wakeup are rarely necessary, because - * bdrv_dec_in_flight already calls it. - */ -void bdrv_wakeup(BlockDriverState *bs); - -#ifdef _WIN32 -int is_windows_drive(const char *filename); -#endif - -/** - * stream_start: - * @job_id: The id of the newly-created job, or %NULL to use the - * device name of @bs. - * @bs: Block device to operate on. - * @base: Block device that will become the new base, or %NULL to - * flatten the whole backing file chain onto @bs. - * @backing_file_str: The file name that will be written to @bs as the - * the new backing file if the job completes. Ignored if @base is %NULL. - * @creation_flags: Flags that control the behavior of the Job lifetime. - * See @BlockJobCreateFlags - * @speed: The maximum speed, in bytes per second, or 0 for unlimited. - * @on_error: The action to take upon error. - * @filter_node_name: The node name that should be assigned to the filter - * driver that the stream job inserts into the graph above - * @bs. NULL means that a node name should be autogenerated. - * @errp: Error object. - * - * Start a streaming operation on @bs. Clusters that are unallocated - * in @bs, but allocated in any image between @base and @bs (both - * exclusive) will be written to @bs. At the end of a successful - * streaming job, the backing file of @bs will be changed to - * @backing_file_str in the written image and to @base in the live - * BlockDriverState. - */ -void stream_start(const char *job_id, BlockDriverState *bs, - BlockDriverState *base, const char *backing_file_str, - BlockDriverState *bottom, - int creation_flags, int64_t speed, - BlockdevOnError on_error, - const char *filter_node_name, - Error **errp); - -/** - * commit_start: - * @job_id: The id of the newly-created job, or %NULL to use the - * device name of @bs. - * @bs: Active block device. - * @top: Top block device to be committed. - * @base: Block device that will be written into, and become the new top. - * @creation_flags: Flags that control the behavior of the Job lifetime. - * See @BlockJobCreateFlags - * @speed: The maximum speed, in bytes per second, or 0 for unlimited. - * @on_error: The action to take upon error. - * @backing_file_str: String to use as the backing file in @top's overlay - * @filter_node_name: The node name that should be assigned to the filter - * driver that the commit job inserts into the graph above @top. NULL means - * that a node name should be autogenerated. - * @errp: Error object. - * - */ -void commit_start(const char *job_id, BlockDriverState *bs, - BlockDriverState *base, BlockDriverState *top, - int creation_flags, int64_t speed, - BlockdevOnError on_error, const char *backing_file_str, - const char *filter_node_name, Error **errp); -/** - * commit_active_start: - * @job_id: The id of the newly-created job, or %NULL to use the - * device name of @bs. - * @bs: Active block device to be committed. - * @base: Block device that will be written into, and become the new top. - * @creation_flags: Flags that control the behavior of the Job lifetime. - * See @BlockJobCreateFlags - * @speed: The maximum speed, in bytes per second, or 0 for unlimited. - * @on_error: The action to take upon error. - * @filter_node_name: The node name that should be assigned to the filter - * driver that the commit job inserts into the graph above @bs. NULL means that - * a node name should be autogenerated. - * @cb: Completion function for the job. - * @opaque: Opaque pointer value passed to @cb. - * @auto_complete: Auto complete the job. - * @errp: Error object. - * - */ -BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs, - BlockDriverState *base, int creation_flags, - int64_t speed, BlockdevOnError on_error, - const char *filter_node_name, - BlockCompletionFunc *cb, void *opaque, - bool auto_complete, Error **errp); -/* - * mirror_start: - * @job_id: The id of the newly-created job, or %NULL to use the - * device name of @bs. - * @bs: Block device to operate on. - * @target: Block device to write to. - * @replaces: Block graph node name to replace once the mirror is done. Can - * only be used when full mirroring is selected. - * @creation_flags: Flags that control the behavior of the Job lifetime. - * See @BlockJobCreateFlags - * @speed: The maximum speed, in bytes per second, or 0 for unlimited. - * @granularity: The chosen granularity for the dirty bitmap. - * @buf_size: The amount of data that can be in flight at one time. - * @mode: Whether to collapse all images in the chain to the target. - * @backing_mode: How to establish the target's backing chain after completion. - * @zero_target: Whether the target should be explicitly zero-initialized - * @on_source_error: The action to take upon error reading from the source. - * @on_target_error: The action to take upon error writing to the target. - * @unmap: Whether to unmap target where source sectors only contain zeroes. - * @filter_node_name: The node name that should be assigned to the filter - * driver that the mirror job inserts into the graph above @bs. NULL means that - * a node name should be autogenerated. - * @copy_mode: When to trigger writes to the target. - * @errp: Error object. - * - * Start a mirroring operation on @bs. Clusters that are allocated - * in @bs will be written to @target until the job is cancelled or - * manually completed. At the end of a successful mirroring job, - * @bs will be switched to read from @target. - */ -void mirror_start(const char *job_id, BlockDriverState *bs, - BlockDriverState *target, const char *replaces, - int creation_flags, int64_t speed, - uint32_t granularity, int64_t buf_size, - MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, - bool zero_target, - BlockdevOnError on_source_error, - BlockdevOnError on_target_error, - bool unmap, const char *filter_node_name, - MirrorCopyMode copy_mode, Error **errp); - -/* - * backup_job_create: - * @job_id: The id of the newly-created job, or %NULL to use the - * device name of @bs. - * @bs: Block device to operate on. - * @target: Block device to write to. - * @speed: The maximum speed, in bytes per second, or 0 for unlimited. - * @sync_mode: What parts of the disk image should be copied to the destination. - * @sync_bitmap: The dirty bitmap if sync_mode is 'bitmap' or 'incremental' - * @bitmap_mode: The bitmap synchronization policy to use. - * @perf: Performance options. All actual fields assumed to be present, - * all ".has_*" fields are ignored. - * @on_source_error: The action to take upon error reading from the source. - * @on_target_error: The action to take upon error writing to the target. - * @creation_flags: Flags that control the behavior of the Job lifetime. - * See @BlockJobCreateFlags - * @cb: Completion function for the job. - * @opaque: Opaque pointer value passed to @cb. - * @txn: Transaction that this job is part of (may be NULL). - * - * Create a backup operation on @bs. Clusters in @bs are written to @target - * until the job is cancelled or manually completed. - */ -BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, - BlockDriverState *target, int64_t speed, - MirrorSyncMode sync_mode, - BdrvDirtyBitmap *sync_bitmap, - BitmapSyncMode bitmap_mode, - bool compress, - const char *filter_node_name, - BackupPerf *perf, - BlockdevOnError on_source_error, - BlockdevOnError on_target_error, - int creation_flags, - BlockCompletionFunc *cb, void *opaque, - JobTxn *txn, Error **errp); - -BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, - const char *child_name, - const BdrvChildClass *child_class, - BdrvChildRole child_role, - uint64_t perm, uint64_t shared_perm, - void *opaque, Error **errp); -void bdrv_root_unref_child(BdrvChild *child); - -void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, - uint64_t *shared_perm); - -/** - * Sets a BdrvChild's permissions. Avoid if the parent is a BDS; use - * bdrv_child_refresh_perms() instead and make the parent's - * .bdrv_child_perm() implementation return the correct values. - */ -int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared, - Error **errp); - -/** - * Calls bs->drv->bdrv_child_perm() and updates the child's permission - * masks with the result. - * Drivers should invoke this function whenever an event occurs that - * makes their .bdrv_child_perm() implementation return different - * values than before, but which will not result in the block layer - * automatically refreshing the permissions. - */ -int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp); - -bool bdrv_recurse_can_replace(BlockDriverState *bs, - BlockDriverState *to_replace); - -/* - * Default implementation for BlockDriver.bdrv_child_perm() that can - * be used by block filters and image formats, as long as they use the - * child_of_bds child class and set an appropriate BdrvChildRole. - */ -void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c, - BdrvChildRole role, BlockReopenQueue *reopen_queue, - uint64_t perm, uint64_t shared, - uint64_t *nperm, uint64_t *nshared); - -const char *bdrv_get_parent_name(const BlockDriverState *bs); -void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp); -bool blk_dev_has_removable_media(BlockBackend *blk); -bool blk_dev_has_tray(BlockBackend *blk); -void blk_dev_eject_request(BlockBackend *blk, bool force); -bool blk_dev_is_tray_open(BlockBackend *blk); -bool blk_dev_is_medium_locked(BlockBackend *blk); - -void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes); - -void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out); -void bdrv_restore_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *backup); -bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest, - const BdrvDirtyBitmap *src, - HBitmap **backup, bool lock); - -void bdrv_inc_in_flight(BlockDriverState *bs); -void bdrv_dec_in_flight(BlockDriverState *bs); - -void blockdev_close_all_bdrv_states(void); - -int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset, - BdrvChild *dst, int64_t dst_offset, - int64_t bytes, - BdrvRequestFlags read_flags, - BdrvRequestFlags write_flags); -int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset, - BdrvChild *dst, int64_t dst_offset, - int64_t bytes, - BdrvRequestFlags read_flags, - BdrvRequestFlags write_flags); - -int refresh_total_sectors(BlockDriverState *bs, int64_t hint); - -void bdrv_set_monitor_owned(BlockDriverState *bs); -BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp); - -/** - * Simple implementation of bdrv_co_create_opts for protocol drivers - * which only support creation via opening a file - * (usually existing raw storage device) - */ -int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, - const char *filename, - QemuOpts *opts, - Error **errp); -extern QemuOptsList bdrv_create_opts_simple; - -BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, - const char *name, - BlockDriverState **pbs, - Error **errp); -BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, - BlockDirtyBitmapMergeSourceList *bms, - HBitmap **backup, Error **errp); -BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, - bool release, - BlockDriverState **bitmap_bs, - Error **errp); - -BdrvChild *bdrv_cow_child(BlockDriverState *bs); -BdrvChild *bdrv_filter_child(BlockDriverState *bs); -BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs); -BdrvChild *bdrv_primary_child(BlockDriverState *bs); -BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs); -BlockDriverState *bdrv_skip_filters(BlockDriverState *bs); -BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs); - -static inline BlockDriverState *child_bs(BdrvChild *child) -{ - return child ? child->bs : NULL; -} - -static inline BlockDriverState *bdrv_cow_bs(BlockDriverState *bs) -{ - return child_bs(bdrv_cow_child(bs)); -} - -static inline BlockDriverState *bdrv_filter_bs(BlockDriverState *bs) -{ - return child_bs(bdrv_filter_child(bs)); -} - -static inline BlockDriverState *bdrv_filter_or_cow_bs(BlockDriverState *bs) -{ - return child_bs(bdrv_filter_or_cow_child(bs)); -} - -static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs) -{ - return child_bs(bdrv_primary_child(bs)); -} - -/** - * End all quiescent sections started by bdrv_drain_all_begin(). This is - * needed when deleting a BDS before bdrv_drain_all_end() is called. - * - * NOTE: this is an internal helper for bdrv_close() *only*. No one else - * should call it. - */ -void bdrv_drain_all_end_quiesce(BlockDriverState *bs); - -/** - * Check whether the given offset is in the cached block-status data - * region. - * - * If it is, and @pnum is not NULL, *pnum is set to - * `bsc.data_end - offset`, i.e. how many bytes, starting from - * @offset, are data (according to the cache). - * Otherwise, *pnum is not touched. - */ -bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum); - -/** - * If [offset, offset + bytes) overlaps with the currently cached - * block-status region, invalidate the cache. - * - * (To be used by I/O paths that cause data regions to be zero or - * holes.) - */ -void bdrv_bsc_invalidate_range(BlockDriverState *bs, - int64_t offset, int64_t bytes); - -/** - * Mark the range [offset, offset + bytes) as a data region. - */ -void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes); +/* DO NOT ADD ANYTHING IN HERE. USE ONE OF THE HEADERS INCLUDED ABOVE */ #endif /* BLOCK_INT_H */ diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 87fbb39..6525e16 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -74,6 +74,13 @@ typedef struct BlockJob { GSList *nodes; } BlockJob; +/* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + /** * block_job_next: * @job: A block job, or %NULL. @@ -155,6 +162,21 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp); */ void block_job_iostatus_reset(BlockJob *job); +/* + * block_job_get_aio_context: + * + * Returns aio context associated with a block job. + */ +AioContext *block_job_get_aio_context(BlockJob *job); + + +/* + * Common functions that are neither I/O nor Global State. + * + * See include/block/block-common.h for more information about + * the Common API. + */ + /** * block_job_is_internal: * @job: The job to determine if it is user-visible or not. @@ -170,11 +192,4 @@ bool block_job_is_internal(BlockJob *job); */ const BlockJobDriver *block_job_driver(BlockJob *job); -/* - * block_job_get_aio_context: - * - * Returns aio context associated with a block job. - */ -AioContext *block_job_get_aio_context(BlockJob *job); - #endif diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h index 6633d83..6bd9ae2 100644 --- a/include/block/blockjob_int.h +++ b/include/block/blockjob_int.h @@ -39,6 +39,13 @@ struct BlockJobDriver { JobDriver job_driver; /* + * I/O API functions. These functions are thread-safe. + * + * See include/block/block-io.h for more information about + * the I/O API. + */ + + /* * Returns whether the job has pending requests for the child or will * submit new requests before the next pause point. This callback is polled * in the context of draining a job node after requesting that the job be @@ -47,6 +54,13 @@ struct BlockJobDriver { bool (*drained_poll)(BlockJob *job); /* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + + /* * If the callback is not NULL, it will be invoked before the job is * resumed in a new AioContext. This is the place to move any resources * besides job->blk to the new AioContext. @@ -56,6 +70,13 @@ struct BlockJobDriver { void (*set_speed)(BlockJob *job, int64_t speed); }; +/* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + /** * block_job_create: * @job_id: The id of the newly-created job, or %NULL to have one @@ -98,6 +119,13 @@ void block_job_free(Job *job); */ void block_job_user_resume(Job *job); +/* + * I/O API functions. These functions are thread-safe. + * + * See include/block/block-io.h for more information about + * the I/O API. + */ + /** * block_job_ratelimit_get_delay: * diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h index 40950ae..6528336 100644 --- a/include/block/dirty-bitmap.h +++ b/include/block/dirty-bitmap.h @@ -77,7 +77,7 @@ void bdrv_dirty_bitmap_set_persistence(BdrvDirtyBitmap *bitmap, bool persistent); void bdrv_dirty_bitmap_set_inconsistent(BdrvDirtyBitmap *bitmap); void bdrv_dirty_bitmap_set_busy(BdrvDirtyBitmap *bitmap, bool busy); -void bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src, +bool bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src, HBitmap **backup, Error **errp); void bdrv_dirty_bitmap_skip_store(BdrvDirtyBitmap *bitmap, bool skip); bool bdrv_dirty_bitmap_get(BdrvDirtyBitmap *bitmap, int64_t offset); @@ -115,6 +115,8 @@ int64_t bdrv_dirty_bitmap_next_zero(BdrvDirtyBitmap *bitmap, int64_t offset, bool bdrv_dirty_bitmap_next_dirty_area(BdrvDirtyBitmap *bitmap, int64_t start, int64_t end, int64_t max_dirty_count, int64_t *dirty_start, int64_t *dirty_count); +bool bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap, int64_t offset, + int64_t bytes, int64_t *count); BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, Error **errp); diff --git a/include/block/nvme.h b/include/block/nvme.h index cd068ac..3737351 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -695,7 +695,8 @@ typedef struct QEMU_PACKED NvmeRwCmd { uint8_t flags; uint16_t cid; uint32_t nsid; - uint64_t rsvd2; + uint32_t cdw2; + uint32_t cdw3; uint64_t mptr; NvmeCmdDptr dptr; uint64_t slba; @@ -731,7 +732,6 @@ enum { NVME_RW_PRINFO_PRCHK_APP = 1 << 11, NVME_RW_PRINFO_PRCHK_REF = 1 << 10, NVME_RW_PRINFO_PRCHK_MASK = 7 << 10, - }; #define NVME_RW_PRINFO(control) ((control >> 10) & 0xf) @@ -770,6 +770,7 @@ typedef struct QEMU_PACKED NvmeDsmRange { enum { NVME_COPY_FORMAT_0 = 0x0, + NVME_COPY_FORMAT_1 = 0x1, }; typedef struct QEMU_PACKED NvmeCopyCmd { @@ -777,7 +778,9 @@ typedef struct QEMU_PACKED NvmeCopyCmd { uint8_t flags; uint16_t cid; uint32_t nsid; - uint32_t rsvd2[4]; + uint32_t cdw2; + uint32_t cdw3; + uint32_t rsvd2[2]; NvmeCmdDptr dptr; uint64_t sdlba; uint8_t nr; @@ -789,7 +792,7 @@ typedef struct QEMU_PACKED NvmeCopyCmd { uint16_t appmask; } NvmeCopyCmd; -typedef struct QEMU_PACKED NvmeCopySourceRange { +typedef struct QEMU_PACKED NvmeCopySourceRangeFormat0 { uint8_t rsvd0[8]; uint64_t slba; uint16_t nlb; @@ -797,7 +800,17 @@ typedef struct QEMU_PACKED NvmeCopySourceRange { uint32_t reftag; uint16_t apptag; uint16_t appmask; -} NvmeCopySourceRange; +} NvmeCopySourceRangeFormat0; + +typedef struct QEMU_PACKED NvmeCopySourceRangeFormat1 { + uint8_t rsvd0[8]; + uint64_t slba; + uint16_t nlb; + uint8_t rsvd18[8]; + uint8_t sr[10]; + uint16_t apptag; + uint16_t appmask; +} NvmeCopySourceRangeFormat1; enum NvmeAsyncEventRequest { NVME_AER_TYPE_ERROR = 0, @@ -908,6 +921,7 @@ enum NvmeStatusCodes { NVME_CMP_FAILURE = 0x0285, NVME_ACCESS_DENIED = 0x0286, NVME_DULB = 0x0287, + NVME_E2E_STORAGE_TAG_ERROR = 0x0288, NVME_MORE = 0x2000, NVME_DNR = 0x4000, NVME_NO_COMPLETE = 0xffff, @@ -1111,6 +1125,10 @@ enum NvmeIdCtrlOaes { NVME_OAES_NS_ATTR = 1 << 8, }; +enum NvmeIdCtrlCtratt { + NVME_CTRATT_ELBAS = 1 << 15, +}; + enum NvmeIdCtrlOacs { NVME_OACS_SECURITY = 1 << 0, NVME_OACS_FORMAT = 1 << 1, @@ -1131,7 +1149,8 @@ enum NvmeIdCtrlOncs { }; enum NvmeIdCtrlOcfs { - NVME_OCFS_COPY_FORMAT_0 = 1 << 0, + NVME_OCFS_COPY_FORMAT_0 = 1 << NVME_COPY_FORMAT_0, + NVME_OCFS_COPY_FORMAT_1 = 1 << NVME_COPY_FORMAT_1, }; enum NvmeIdctrlVwc { @@ -1216,6 +1235,7 @@ enum NvmeFeatureIds { NVME_WRITE_ATOMICITY = 0xa, NVME_ASYNCHRONOUS_EVENT_CONF = 0xb, NVME_TIMESTAMP = 0xe, + NVME_HOST_BEHAVIOR_SUPPORT = 0x16, NVME_COMMAND_SET_PROFILE = 0x19, NVME_SOFTWARE_PROGRESS_MARKER = 0x80, NVME_FID_MAX = 0x100, @@ -1257,6 +1277,13 @@ typedef struct QEMU_PACKED NvmeRangeType { uint8_t rsvd48[16]; } NvmeRangeType; +typedef struct NvmeHostBehaviorSupport { + uint8_t acre; + uint8_t etdas; + uint8_t lbafee; + uint8_t rsvd3[509]; +} NvmeHostBehaviorSupport; + typedef struct QEMU_PACKED NvmeLBAF { uint16_t ms; uint8_t ds; @@ -1270,6 +1297,7 @@ typedef struct QEMU_PACKED NvmeLBAFE { } NvmeLBAFE; #define NVME_NSID_BROADCAST 0xffffffff +#define NVME_MAX_NLBAF 64 typedef struct QEMU_PACKED NvmeIdNs { uint64_t nsze; @@ -1304,11 +1332,20 @@ typedef struct QEMU_PACKED NvmeIdNs { uint8_t rsvd81[23]; uint8_t nguid[16]; uint64_t eui64; - NvmeLBAF lbaf[16]; - uint8_t rsvd192[192]; + NvmeLBAF lbaf[NVME_MAX_NLBAF]; uint8_t vs[3712]; } NvmeIdNs; +#define NVME_ID_NS_NVM_ELBAF_PIF(elbaf) (((elbaf) >> 7) & 0x3) + +typedef struct QEMU_PACKED NvmeIdNsNvm { + uint64_t lbstm; + uint8_t pic; + uint8_t rsvd9[3]; + uint32_t elbaf[NVME_MAX_NLBAF]; + uint8_t rsvd268[3828]; +} NvmeIdNsNvm; + typedef struct QEMU_PACKED NvmeIdNsDescr { uint8_t nidt; uint8_t nidl; @@ -1410,10 +1447,23 @@ enum NvmeIdNsMc { #define NVME_ID_NS_DPS_TYPE(dps) (dps & NVME_ID_NS_DPS_TYPE_MASK) -typedef struct NvmeDifTuple { - uint16_t guard; - uint16_t apptag; - uint32_t reftag; +enum NvmePIFormat { + NVME_PI_GUARD_16 = 0, + NVME_PI_GUARD_64 = 2, +}; + +typedef union NvmeDifTuple { + struct { + uint16_t guard; + uint16_t apptag; + uint32_t reftag; + } g16; + + struct { + uint64_t guard; + uint16_t apptag; + uint8_t sr[6]; + } g64; } NvmeDifTuple; enum NvmeZoneAttr { @@ -1510,7 +1560,8 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeZonedResult) != 8); QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); - QEMU_BUILD_BUG_ON(sizeof(NvmeCopySourceRange) != 32); + QEMU_BUILD_BUG_ON(sizeof(NvmeCopySourceRangeFormat0) != 32); + QEMU_BUILD_BUG_ON(sizeof(NvmeCopySourceRangeFormat1) != 40); QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64); @@ -1520,6 +1571,7 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeCopyCmd) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeHostBehaviorSupport) != 512); QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); @@ -1530,10 +1582,11 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4); QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsNvm) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4); QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64); - QEMU_BUILD_BUG_ON(sizeof(NvmeDifTuple) != 8); + QEMU_BUILD_BUG_ON(sizeof(NvmeDifTuple) != 16); } #endif diff --git a/include/block/reqlist.h b/include/block/reqlist.h new file mode 100644 index 0000000..5253497 --- /dev/null +++ b/include/block/reqlist.h @@ -0,0 +1,75 @@ +/* + * reqlist API + * + * Copyright (C) 2013 Proxmox Server Solutions + * Copyright (c) 2021 Virtuozzo International GmbH. + * + * Authors: + * Dietmar Maurer (dietmar@proxmox.com) + * Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef REQLIST_H +#define REQLIST_H + +#include "qemu/coroutine.h" + +/* + * The API is not thread-safe and shouldn't be. The struct is public to be part + * of other structures and protected by third-party locks, see + * block/block-copy.c for example. + */ + +typedef struct BlockReq { + int64_t offset; + int64_t bytes; + + CoQueue wait_queue; /* coroutines blocked on this req */ + QLIST_ENTRY(BlockReq) list; +} BlockReq; + +typedef QLIST_HEAD(, BlockReq) BlockReqList; + +/* + * Initialize new request and add it to the list. Caller must be sure that + * there are no conflicting requests in the list. + */ +void reqlist_init_req(BlockReqList *reqs, BlockReq *req, int64_t offset, + int64_t bytes); +/* Search for request in the list intersecting with @offset/@bytes area. */ +BlockReq *reqlist_find_conflict(BlockReqList *reqs, int64_t offset, + int64_t bytes); + +/* + * If there are no intersecting requests return false. Otherwise, wait for the + * first found intersecting request to finish and return true. + * + * @lock is passed to qemu_co_queue_wait() + * False return value proves that lock was released at no point. + */ +bool coroutine_fn reqlist_wait_one(BlockReqList *reqs, int64_t offset, + int64_t bytes, CoMutex *lock); + +/* + * Wait for all intersecting requests. It just calls reqlist_wait_one() in a + * loop, caller is responsible to stop producing new requests in this region + * in parallel, otherwise reqlist_wait_all() may never return. + */ +void coroutine_fn reqlist_wait_all(BlockReqList *reqs, int64_t offset, + int64_t bytes, CoMutex *lock); + +/* + * Shrink request and wake all waiting coroutines (maybe some of them are not + * intersecting with shrunk request). + */ +void coroutine_fn reqlist_shrink_req(BlockReq *req, int64_t new_bytes); + +/* + * Remove request and wake all waiting coroutines. Do not release any memory. + */ +void coroutine_fn reqlist_remove_req(BlockReq *req); + +#endif /* REQLIST_H */ diff --git a/include/block/snapshot.h b/include/block/snapshot.h index 9403456..50ff924 100644 --- a/include/block/snapshot.h +++ b/include/block/snapshot.h @@ -45,6 +45,13 @@ typedef struct QEMUSnapshotInfo { uint64_t icount; /* record/replay step */ } QEMUSnapshotInfo; +/* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, const char *name); bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs, @@ -73,9 +80,11 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, Error **errp); -/* Group operations. All block drivers are involved. +/* + * Group operations. All block drivers are involved. * These functions will properly handle dataplane (take aio_context_acquire - * when appropriate for appropriate block drivers */ + * when appropriate for appropriate block drivers + */ bool bdrv_all_can_snapshot(bool has_devices, strList *devices, Error **errp); diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index 84caf5c..c0f0fab 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -433,10 +433,6 @@ int cpu_exec(CPUState *cpu); void tcg_exec_realizefn(CPUState *cpu, Error **errp); void tcg_exec_unrealizefn(CPUState *cpu); -/* Returns: 0 on success, -1 on error */ -int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr, - void *ptr, target_ulong len, bool is_write); - /** * cpu_set_cpustate_pointers(cpu) * @cpu: The cpu object diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index de5f444..7f7b594 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -7,6 +7,18 @@ #include "exec/hwaddr.h" #endif +/** + * vaddr: + * Type wide enough to contain any #target_ulong virtual address. + */ +typedef uint64_t vaddr; +#define VADDR_PRId PRId64 +#define VADDR_PRIu PRIu64 +#define VADDR_PRIo PRIo64 +#define VADDR_PRIx PRIx64 +#define VADDR_PRIX PRIX64 +#define VADDR_MAX UINT64_MAX + /* Using intptr_t ensures that qemu_*_page_mask is sign-extended even * when intptr_t is 32-bit and we are aligning a long long. */ @@ -78,6 +90,28 @@ void qemu_ram_unset_migratable(RAMBlock *rb); size_t qemu_ram_pagesize(RAMBlock *block); size_t qemu_ram_pagesize_largest(void); +/** + * cpu_address_space_init: + * @cpu: CPU to add this address space to + * @asidx: integer index of this address space + * @prefix: prefix to be used as name of address space + * @mr: the root memory region of address space + * + * Add the specified address space to the CPU's cpu_ases list. + * The address space added with @asidx 0 is the one used for the + * convenience pointer cpu->as. + * The target-specific code which registers ASes is responsible + * for defining what semantics address space 0, 1, 2, etc have. + * + * Before the first call to this function, the caller must set + * cpu->num_ases to the total number of address spaces it needs + * to support. + * + * Note that with KVM only one address space is supported. + */ +void cpu_address_space_init(CPUState *cpu, int asidx, + const char *prefix, MemoryRegion *mr); + void cpu_physical_memory_rw(hwaddr addr, void *buf, hwaddr len, bool is_write); static inline void cpu_physical_memory_read(hwaddr addr, @@ -90,6 +124,7 @@ static inline void cpu_physical_memory_write(hwaddr addr, { cpu_physical_memory_rw(addr, (void *)buf, len, true); } +void cpu_reloading_memory_map(void); void *cpu_physical_memory_map(hwaddr addr, hwaddr *plen, bool is_write); @@ -116,6 +151,10 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length); #endif +/* Returns: 0 on success, -1 on error */ +int cpu_memory_rw_debug(CPUState *cpu, vaddr addr, + void *ptr, size_t len, bool is_write); + /* vl.c */ extern int singlestep; diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h index da987fe..6adacf8 100644 --- a/include/exec/cpu_ldst.h +++ b/include/exec/cpu_ldst.h @@ -64,6 +64,7 @@ #include "exec/memopidx.h" #include "qemu/int128.h" +#include "cpu.h" #if defined(CONFIG_USER_ONLY) /* sparc32plus has 64bit long but 32bit space address diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 227e10b..d2cb098 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -24,7 +24,6 @@ #ifdef CONFIG_TCG #include "exec/cpu_ldst.h" #endif -#include "sysemu/cpu-timers.h" /* allow to see translation results - the slowdown should be negligible, so we leave it */ #define DEBUG_DISAS @@ -81,31 +80,6 @@ static inline bool cpu_loop_exit_requested(CPUState *cpu) return (int32_t)qatomic_read(&cpu_neg(cpu)->icount_decr.u32) < 0; } -#if !defined(CONFIG_USER_ONLY) -void cpu_reloading_memory_map(void); -/** - * cpu_address_space_init: - * @cpu: CPU to add this address space to - * @asidx: integer index of this address space - * @prefix: prefix to be used as name of address space - * @mr: the root memory region of address space - * - * Add the specified address space to the CPU's cpu_ases list. - * The address space added with @asidx 0 is the one used for the - * convenience pointer cpu->as. - * The target-specific code which registers ASes is responsible - * for defining what semantics address space 0, 1, 2, etc have. - * - * Before the first call to this function, the caller must set - * cpu->num_ases to the total number of address spaces it needs - * to support. - * - * Note that with KVM only one address space is supported. - */ -void cpu_address_space_init(CPUState *cpu, int asidx, - const char *prefix, MemoryRegion *mr); -#endif - #if !defined(CONFIG_USER_ONLY) && defined(CONFIG_TCG) /* cputlb.c */ /** diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h index a024a03..89edf94 100644 --- a/include/exec/gdbstub.h +++ b/include/exec/gdbstub.h @@ -45,17 +45,6 @@ void gdb_do_syscall(gdb_syscall_complete_cb cb, const char *fmt, ...); */ void gdb_do_syscallv(gdb_syscall_complete_cb cb, const char *fmt, va_list va); int use_gdb_syscalls(void); -void gdb_set_stop_cpu(CPUState *cpu); - -/** - * gdb_exit: exit gdb session, reporting inferior status - * @code: exit code reported - * - * This closes the session and sends a final packet to GDB reporting - * the exit status of the program. It also cleans up any connections - * detritus before returning. - */ -void gdb_exit(int code); #ifdef CONFIG_USER_ONLY /** @@ -165,7 +154,7 @@ static inline uint8_t * gdb_get_reg_ptr(GByteArray *buf, int len) #define ldtul_p(addr) ldl_p(addr) #endif -#endif +#endif /* NEED_CPU_H */ /** * gdbserver_start: start the gdb server @@ -178,6 +167,18 @@ static inline uint8_t * gdb_get_reg_ptr(GByteArray *buf, int len) int gdbserver_start(const char *port_or_device); /** + * gdb_exit: exit gdb session, reporting inferior status + * @code: exit code reported + * + * This closes the session and sends a final packet to GDB reporting + * the exit status of the program. It also cleans up any connections + * detritus before returning. + */ +void gdb_exit(int code); + +void gdb_set_stop_cpu(CPUState *cpu); + +/** * gdb_has_xml: * This is an ugly hack to cope with both new and old gdb. * If gdb sends qXfer:features:read then assume we're talking to a newish diff --git a/include/exec/poison.h b/include/exec/poison.h index 7ad4ad1..7c5c02f 100644 --- a/include/exec/poison.h +++ b/include/exec/poison.h @@ -51,8 +51,6 @@ #pragma GCC poison TARGET_PAGE_BITS #pragma GCC poison TARGET_PAGE_ALIGN -#pragma GCC poison CPUArchState - #pragma GCC poison CPU_INTERRUPT_HARD #pragma GCC poison CPU_INTERRUPT_EXITTB #pragma GCC poison CPU_INTERRUPT_HALT diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index c1ea17d..7e76ee2 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -132,6 +132,7 @@ struct VirtMachineClass { bool no_secure_gpio; /* Machines < 6.2 have no support for describing cpu topology to guest */ bool no_cpu_topology; + bool no_tcg_lpa2; }; struct VirtMachineState { diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 76ab3b8..0efc615 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -22,6 +22,7 @@ #include "hw/qdev-core.h" #include "disas/dis-asm.h" +#include "exec/cpu-common.h" #include "exec/hwaddr.h" #include "exec/memattrs.h" #include "qapi/qapi-types-run-state.h" @@ -36,18 +37,6 @@ typedef int (*WriteCoreDumpFunction)(const void *buf, size_t size, void *opaque); /** - * vaddr: - * Type wide enough to contain any #target_ulong virtual address. - */ -typedef uint64_t vaddr; -#define VADDR_PRId PRId64 -#define VADDR_PRIu PRIu64 -#define VADDR_PRIo PRIo64 -#define VADDR_PRIx PRIx64 -#define VADDR_PRIX PRIX64 -#define VADDR_MAX UINT64_MAX - -/** * SECTION:cpu * @section_id: QEMU-cpu * @title: CPU Class @@ -66,6 +55,24 @@ typedef struct CPUClass CPUClass; DECLARE_CLASS_CHECKERS(CPUClass, CPU, TYPE_CPU) +/** + * OBJECT_DECLARE_CPU_TYPE: + * @CpuInstanceType: instance struct name + * @CpuClassType: class struct name + * @CPU_MODULE_OBJ_NAME: the CPU name in uppercase with underscore separators + * + * This macro is typically used in "cpu-qom.h" header file, and will: + * + * - create the typedefs for the CPU object and class structs + * - register the type for use with g_autoptr + * - provide three standard type cast functions + * + * The object struct and class struct need to be declared manually. + */ +#define OBJECT_DECLARE_CPU_TYPE(CpuInstanceType, CpuClassType, CPU_MODULE_OBJ_NAME) \ + typedef struct ArchCPU CpuInstanceType; \ + OBJECT_DECLARE_TYPE(ArchCPU, CpuClassType, CPU_MODULE_OBJ_NAME); + typedef enum MMUAccessType { MMU_DATA_LOAD = 0, MMU_DATA_STORE = 1, @@ -351,7 +358,7 @@ struct CPUState { AddressSpace *as; MemoryRegion *memory; - void *env_ptr; /* CPUArchState */ + CPUArchState *env_ptr; IcountDecr *icount_decr_ptr; /* Accessed in parallel; all accesses must be atomic */ diff --git a/include/hw/intc/riscv_imsic.h b/include/hw/intc/riscv_imsic.h new file mode 100644 index 0000000..58c2aaa --- /dev/null +++ b/include/hw/intc/riscv_imsic.h @@ -0,0 +1,68 @@ +/* + * RISC-V IMSIC (Incoming Message Signal Interrupt Controller) interface + * + * Copyright (c) 2021 Western Digital Corporation or its affiliates. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef HW_RISCV_IMSIC_H +#define HW_RISCV_IMSIC_H + +#include "hw/sysbus.h" +#include "qom/object.h" + +#define TYPE_RISCV_IMSIC "riscv.imsic" + +typedef struct RISCVIMSICState RISCVIMSICState; +DECLARE_INSTANCE_CHECKER(RISCVIMSICState, RISCV_IMSIC, TYPE_RISCV_IMSIC) + +#define IMSIC_MMIO_PAGE_SHIFT 12 +#define IMSIC_MMIO_PAGE_SZ (1UL << IMSIC_MMIO_PAGE_SHIFT) +#define IMSIC_MMIO_SIZE(__num_pages) ((__num_pages) * IMSIC_MMIO_PAGE_SZ) + +#define IMSIC_MMIO_HART_GUEST_MAX_BTIS 6 +#define IMSIC_MMIO_GROUP_MIN_SHIFT 24 + +#define IMSIC_HART_NUM_GUESTS(__guest_bits) \ + (1U << (__guest_bits)) +#define IMSIC_HART_SIZE(__guest_bits) \ + (IMSIC_HART_NUM_GUESTS(__guest_bits) * IMSIC_MMIO_PAGE_SZ) +#define IMSIC_GROUP_NUM_HARTS(__hart_bits) \ + (1U << (__hart_bits)) +#define IMSIC_GROUP_SIZE(__hart_bits, __guest_bits) \ + (IMSIC_GROUP_NUM_HARTS(__hart_bits) * IMSIC_HART_SIZE(__guest_bits)) + +struct RISCVIMSICState { + /*< private >*/ + SysBusDevice parent_obj; + qemu_irq *external_irqs; + + /*< public >*/ + MemoryRegion mmio; + uint32_t num_eistate; + uint32_t *eidelivery; + uint32_t *eithreshold; + uint32_t *eistate; + + /* config */ + bool mmode; + uint32_t hartid; + uint32_t num_pages; + uint32_t num_irqs; +}; + +DeviceState *riscv_imsic_create(hwaddr addr, uint32_t hartid, bool mmode, + uint32_t num_pages, uint32_t num_ids); + +#endif diff --git a/include/hw/riscv/opentitan.h b/include/hw/riscv/opentitan.h index eac35ef..00da9de 100644 --- a/include/hw/riscv/opentitan.h +++ b/include/hw/riscv/opentitan.h @@ -57,8 +57,10 @@ enum { IBEX_DEV_FLASH, IBEX_DEV_FLASH_VIRTUAL, IBEX_DEV_UART, + IBEX_DEV_SPI_DEVICE, + IBEX_DEV_SPI_HOST0, + IBEX_DEV_SPI_HOST1, IBEX_DEV_GPIO, - IBEX_DEV_SPI, IBEX_DEV_I2C, IBEX_DEV_PATTGEN, IBEX_DEV_TIMER, diff --git a/include/hw/riscv/virt.h b/include/hw/riscv/virt.h index 6e9f61c..78b058e 100644 --- a/include/hw/riscv/virt.h +++ b/include/hw/riscv/virt.h @@ -24,26 +24,36 @@ #include "hw/block/flash.h" #include "qom/object.h" -#define VIRT_CPUS_MAX 32 -#define VIRT_SOCKETS_MAX 8 +#define VIRT_CPUS_MAX_BITS 9 +#define VIRT_CPUS_MAX (1 << VIRT_CPUS_MAX_BITS) +#define VIRT_SOCKETS_MAX_BITS 2 +#define VIRT_SOCKETS_MAX (1 << VIRT_SOCKETS_MAX_BITS) #define TYPE_RISCV_VIRT_MACHINE MACHINE_TYPE_NAME("virt") typedef struct RISCVVirtState RISCVVirtState; DECLARE_INSTANCE_CHECKER(RISCVVirtState, RISCV_VIRT_MACHINE, TYPE_RISCV_VIRT_MACHINE) +typedef enum RISCVVirtAIAType { + VIRT_AIA_TYPE_NONE = 0, + VIRT_AIA_TYPE_APLIC, + VIRT_AIA_TYPE_APLIC_IMSIC, +} RISCVVirtAIAType; + struct RISCVVirtState { /*< private >*/ MachineState parent; /*< public >*/ RISCVHartArrayState soc[VIRT_SOCKETS_MAX]; - DeviceState *plic[VIRT_SOCKETS_MAX]; + DeviceState *irqchip[VIRT_SOCKETS_MAX]; PFlashCFI01 *flash[2]; FWCfgState *fw_cfg; int fdt_size; bool have_aclint; + RISCVVirtAIAType aia_type; + int aia_guests; }; enum { @@ -54,9 +64,13 @@ enum { VIRT_CLINT, VIRT_ACLINT_SSWI, VIRT_PLIC, + VIRT_APLIC_M, + VIRT_APLIC_S, VIRT_UART0, VIRT_VIRTIO, VIRT_FW_CFG, + VIRT_IMSIC_M, + VIRT_IMSIC_S, VIRT_FLASH, VIRT_DRAM, VIRT_PCIE_MMIO, @@ -73,8 +87,13 @@ enum { VIRTIO_NDEV = 0x35 /* Arbitrary maximum number of interrupts */ }; -#define VIRT_PLIC_NUM_SOURCES 127 -#define VIRT_PLIC_NUM_PRIORITIES 7 +#define VIRT_IRQCHIP_IPI_MSI 1 +#define VIRT_IRQCHIP_NUM_MSIS 255 +#define VIRT_IRQCHIP_NUM_SOURCES VIRTIO_NDEV +#define VIRT_IRQCHIP_NUM_PRIO_BITS 3 +#define VIRT_IRQCHIP_MAX_GUESTS_BITS 3 +#define VIRT_IRQCHIP_MAX_GUESTS ((1U << VIRT_IRQCHIP_MAX_GUESTS_BITS) - 1U) + #define VIRT_PLIC_PRIORITY_BASE 0x04 #define VIRT_PLIC_PENDING_BASE 0x1000 #define VIRT_PLIC_ENABLE_BASE 0x2000 @@ -86,9 +105,15 @@ enum { #define FDT_PCI_ADDR_CELLS 3 #define FDT_PCI_INT_CELLS 1 -#define FDT_PLIC_ADDR_CELLS 0 #define FDT_PLIC_INT_CELLS 1 -#define FDT_INT_MAP_WIDTH (FDT_PCI_ADDR_CELLS + FDT_PCI_INT_CELLS + 1 + \ - FDT_PLIC_ADDR_CELLS + FDT_PLIC_INT_CELLS) +#define FDT_APLIC_INT_CELLS 2 +#define FDT_IMSIC_INT_CELLS 0 +#define FDT_MAX_INT_CELLS 2 +#define FDT_MAX_INT_MAP_WIDTH (FDT_PCI_ADDR_CELLS + FDT_PCI_INT_CELLS + \ + 1 + FDT_MAX_INT_CELLS) +#define FDT_PLIC_INT_MAP_WIDTH (FDT_PCI_ADDR_CELLS + FDT_PCI_INT_CELLS + \ + 1 + FDT_PLIC_INT_CELLS) +#define FDT_APLIC_INT_MAP_WIDTH (FDT_PCI_ADDR_CELLS + FDT_PCI_INT_CELLS + \ + 1 + FDT_APLIC_INT_CELLS) #endif diff --git a/include/qemu-common.h b/include/qemu-common.h index 68b2e3b..8c0d9ab 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -26,8 +26,6 @@ int qemu_main(int argc, char **argv, char **envp); #endif -void *qemu_oom_check(void *ptr); - ssize_t qemu_write_full(int fd, const void *buf, size_t count) QEMU_WARN_UNUSED_RESULT; diff --git a/include/qemu/coroutine-tls.h b/include/qemu/coroutine-tls.h new file mode 100644 index 0000000..1558a82 --- /dev/null +++ b/include/qemu/coroutine-tls.h @@ -0,0 +1,165 @@ +/* + * QEMU Thread Local Storage for coroutines + * + * Copyright Red Hat + * + * SPDX-License-Identifier: LGPL-2.1-or-later + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. + * See the COPYING.LIB file in the top-level directory. + * + * It is forbidden to access Thread Local Storage in coroutines because + * compiler optimizations may cause values to be cached across coroutine + * re-entry. Coroutines can run in more than one thread through the course of + * their life, leading bugs when stale TLS values from the wrong thread are + * used as a result of compiler optimization. + * + * An example is: + * + * ..code-block:: c + * :caption: A coroutine that may see the wrong TLS value + * + * static __thread AioContext *current_aio_context; + * ... + * static void coroutine_fn foo(void) + * { + * aio_notify(current_aio_context); + * qemu_coroutine_yield(); + * aio_notify(current_aio_context); // <-- may be stale after yielding! + * } + * + * This header provides macros for safely defining variables in Thread Local + * Storage: + * + * ..code-block:: c + * :caption: A coroutine that safely uses TLS + * + * QEMU_DEFINE_STATIC_CO_TLS(AioContext *, current_aio_context) + * ... + * static void coroutine_fn foo(void) + * { + * aio_notify(get_current_aio_context()); + * qemu_coroutine_yield(); + * aio_notify(get_current_aio_context()); // <-- safe + * } + */ + +#ifndef QEMU_COROUTINE_TLS_H +#define QEMU_COROUTINE_TLS_H + +/* + * To stop the compiler from caching TLS values we define accessor functions + * with __attribute__((noinline)) plus asm volatile("") to prevent + * optimizations that override noinline. + * + * The compiler can still analyze noinline code and make optimizations based on + * that knowledge, so an inline asm output operand is used to prevent + * optimizations that make assumptions about the address of the TLS variable. + * + * This is fragile and ultimately needs to be solved by a mechanism that is + * guaranteed to work by the compiler (e.g. stackless coroutines), but for now + * we use this approach to prevent issues. + */ + +/** + * QEMU_DECLARE_CO_TLS: + * @type: the variable's C type + * @var: the variable name + * + * Declare an extern variable in Thread Local Storage from a header file: + * + * .. code-block:: c + * :caption: Declaring an extern variable in Thread Local Storage + * + * QEMU_DECLARE_CO_TLS(int, my_count) + * ... + * int c = get_my_count(); + * set_my_count(c + 1); + * *get_ptr_my_count() = 0; + * + * This is a coroutine-safe replacement for the __thread keyword and is + * equivalent to the following code: + * + * .. code-block:: c + * :caption: Declaring a TLS variable using __thread + * + * extern __thread int my_count; + * ... + * int c = my_count; + * my_count = c + 1; + * *(&my_count) = 0; + */ +#define QEMU_DECLARE_CO_TLS(type, var) \ + __attribute__((noinline)) type get_##var(void); \ + __attribute__((noinline)) void set_##var(type v); \ + __attribute__((noinline)) type *get_ptr_##var(void); + +/** + * QEMU_DEFINE_CO_TLS: + * @type: the variable's C type + * @var: the variable name + * + * Define a variable in Thread Local Storage that was previously declared from + * a header file with QEMU_DECLARE_CO_TLS(): + * + * .. code-block:: c + * :caption: Defining a variable in Thread Local Storage + * + * QEMU_DEFINE_CO_TLS(int, my_count) + * + * This is a coroutine-safe replacement for the __thread keyword and is + * equivalent to the following code: + * + * .. code-block:: c + * :caption: Defining a TLS variable using __thread + * + * __thread int my_count; + */ +#define QEMU_DEFINE_CO_TLS(type, var) \ + static __thread type co_tls_##var; \ + type get_##var(void) { asm volatile(""); return co_tls_##var; } \ + void set_##var(type v) { asm volatile(""); co_tls_##var = v; } \ + type *get_ptr_##var(void) \ + { type *ptr = &co_tls_##var; asm volatile("" : "+rm" (ptr)); return ptr; } + +/** + * QEMU_DEFINE_STATIC_CO_TLS: + * @type: the variable's C type + * @var: the variable name + * + * Define a static variable in Thread Local Storage: + * + * .. code-block:: c + * :caption: Defining a static variable in Thread Local Storage + * + * QEMU_DEFINE_STATIC_CO_TLS(int, my_count) + * ... + * int c = get_my_count(); + * set_my_count(c + 1); + * *get_ptr_my_count() = 0; + * + * This is a coroutine-safe replacement for the __thread keyword and is + * equivalent to the following code: + * + * .. code-block:: c + * :caption: Defining a static TLS variable using __thread + * + * static __thread int my_count; + * ... + * int c = my_count; + * my_count = c + 1; + * *(&my_count) = 0; + */ +#define QEMU_DEFINE_STATIC_CO_TLS(type, var) \ + static __thread type co_tls_##var; \ + static __attribute__((noinline, unused)) \ + type get_##var(void) \ + { asm volatile(""); return co_tls_##var; } \ + static __attribute__((noinline, unused)) \ + void set_##var(type v) \ + { asm volatile(""); co_tls_##var = v; } \ + static __attribute__((noinline, unused)) \ + type *get_ptr_##var(void) \ + { type *ptr = &co_tls_##var; asm volatile("" : "+rm" (ptr)); return ptr; } + +#endif /* QEMU_COROUTINE_TLS_H */ diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h index 09fc245..7adb12d 100644 --- a/include/qemu/cpuid.h +++ b/include/qemu/cpuid.h @@ -45,12 +45,26 @@ #ifndef bit_AVX2 #define bit_AVX2 (1 << 5) #endif -#ifndef bit_AVX512F -#define bit_AVX512F (1 << 16) -#endif #ifndef bit_BMI2 #define bit_BMI2 (1 << 8) #endif +#ifndef bit_AVX512F +#define bit_AVX512F (1 << 16) +#endif +#ifndef bit_AVX512DQ +#define bit_AVX512DQ (1 << 17) +#endif +#ifndef bit_AVX512BW +#define bit_AVX512BW (1 << 30) +#endif +#ifndef bit_AVX512VL +#define bit_AVX512VL (1u << 31) +#endif + +/* Leaf 7, %ecx */ +#ifndef bit_AVX512VBMI2 +#define bit_AVX512VBMI2 (1 << 6) +#endif /* Leaf 0x80000001, %ecx */ #ifndef bit_LZCNT diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h index 5e71b6d..5bd986a 100644 --- a/include/qemu/hbitmap.h +++ b/include/qemu/hbitmap.h @@ -340,6 +340,18 @@ bool hbitmap_next_dirty_area(const HBitmap *hb, int64_t start, int64_t end, int64_t max_dirty_count, int64_t *dirty_start, int64_t *dirty_count); +/* + * bdrv_dirty_bitmap_status: + * @hb: The HBitmap to operate on + * @start: The bit to start from + * @count: Number of bits to proceed + * @pnum: Out-parameter. How many bits has same value starting from @start + * + * Returns true if bitmap is dirty at @start, false otherwise. + */ +bool hbitmap_status(const HBitmap *hb, int64_t start, int64_t count, + int64_t *pnum); + /** * hbitmap_iter_next: * @hbi: HBitmapIter to operate on. diff --git a/include/qemu/job.h b/include/qemu/job.h index 6e67b69..c105b31 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -169,6 +169,12 @@ typedef struct Job { * Callbacks and other information about a Job driver. */ struct JobDriver { + + /* + * These fields are initialized when this object is created, + * and are never changed afterwards + */ + /** Derived Job struct size */ size_t instance_size; @@ -184,9 +190,18 @@ struct JobDriver { * aborted. If it returns zero, the job moves into the WAITING state. If it * is the last job to complete in its transaction, all jobs in the * transaction move from WAITING to PENDING. + * + * This callback must be run in the job's context. */ int coroutine_fn (*run)(Job *job, Error **errp); + /* + * Functions run without regard to the BQL that may run in any + * arbitrary thread. These functions do not need to be thread-safe + * because the caller ensures that they are invoked from one + * thread at time. + */ + /** * If the callback is not NULL, it will be invoked when the job transitions * into the paused state. Paused jobs must not perform any asynchronous @@ -201,6 +216,13 @@ struct JobDriver { */ void coroutine_fn (*resume)(Job *job); + /* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + /** * Called when the job is resumed by the user (i.e. user_paused becomes * false). .user_resume is called before .resume. diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h index 8dbc6fc..7a4d6a0 100644 --- a/include/qemu/main-loop.h +++ b/include/qemu/main-loop.h @@ -242,10 +242,52 @@ AioContext *iohandler_get_aio_context(void); * must always be taken outside other locks. This function helps * functions take different paths depending on whether the current * thread is running within the main loop mutex. + * + * This function should never be used in the block layer, because + * unit tests, block layer tools and qemu-storage-daemon do not + * have a BQL. + * Please instead refer to qemu_in_main_thread(). */ bool qemu_mutex_iothread_locked(void); /** + * qemu_in_main_thread: return whether it's possible to safely access + * the global state of the block layer. + * + * Global state of the block layer is not accessible from I/O threads + * or worker threads; only from threads that "own" the default + * AioContext that qemu_get_aio_context() returns. For tests, block + * layer tools and qemu-storage-daemon there is a designated thread that + * runs the event loop for qemu_get_aio_context(), and that is the + * main thread. + * + * For emulators, however, any thread that holds the BQL can act + * as the block layer main thread; this will be any of the actual + * main thread, the vCPU threads or the RCU thread. + * + * For clarity, do not use this function outside the block layer. + */ +bool qemu_in_main_thread(void); + +/* Mark and check that the function is part of the global state API. */ +#define GLOBAL_STATE_CODE() \ + do { \ + assert(qemu_in_main_thread()); \ + } while (0) + +/* Mark and check that the function is part of the I/O API. */ +#define IO_CODE() \ + do { \ + /* nop */ \ + } while (0) + +/* Mark and check that the function is part of the "I/O OR GS" API. */ +#define IO_OR_GS_CODE() \ + do { \ + /* nop */ \ + } while (0) + +/** * qemu_mutex_lock_iothread: Lock the main loop mutex. * * This function locks the main loop mutex. The mutex is taken by diff --git a/include/qemu/memalign.h b/include/qemu/memalign.h new file mode 100644 index 0000000..fa299f3 --- /dev/null +++ b/include/qemu/memalign.h @@ -0,0 +1,61 @@ +/* + * Allocation and free functions for aligned memory + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_MEMALIGN_H +#define QEMU_MEMALIGN_H + +/** + * qemu_try_memalign: Allocate aligned memory + * @alignment: required alignment, in bytes + * @size: size of allocation, in bytes + * + * Allocate memory on an aligned boundary (i.e. the returned + * address will be an exact multiple of @alignment). + * @alignment must be a power of 2, or the function will assert(). + * On success, returns allocated memory; on failure, returns NULL. + * + * The memory allocated through this function must be freed via + * qemu_vfree() (and not via free()). + */ +void *qemu_try_memalign(size_t alignment, size_t size); +/** + * qemu_memalign: Allocate aligned memory, without failing + * @alignment: required alignment, in bytes + * @size: size of allocation, in bytes + * + * Allocate memory in the same way as qemu_try_memalign(), but + * abort() with an error message if the memory allocation fails. + * + * The memory allocated through this function must be freed via + * qemu_vfree() (and not via free()). + */ +void *qemu_memalign(size_t alignment, size_t size); +/** + * qemu_vfree: Free memory allocated through qemu_memalign + * @ptr: memory to free + * + * This function must be used to free memory allocated via qemu_memalign() + * or qemu_try_memalign(). (Using the wrong free function will cause + * subtle bugs on Windows hosts.) + */ +void qemu_vfree(void *ptr); +/* + * It's an analog of GLIB's g_autoptr_cleanup_generic_gfree(), used to define + * g_autofree macro. + */ +static inline void qemu_cleanup_generic_vfree(void *p) +{ + void **pp = (void **)p; + qemu_vfree(*pp); +} + +/* + * Analog of g_autofree, but qemu_vfree is called on cleanup instead of g_free. + */ +#define QEMU_AUTO_VFREE __attribute__((cleanup(qemu_cleanup_generic_vfree))) + +#endif diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 7bcce3b..c9ec783 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -379,28 +379,10 @@ extern "C" { #endif int qemu_daemon(int nochdir, int noclose); -void *qemu_try_memalign(size_t alignment, size_t size); -void *qemu_memalign(size_t alignment, size_t size); void *qemu_anon_ram_alloc(size_t size, uint64_t *align, bool shared, bool noreserve); -void qemu_vfree(void *ptr); void qemu_anon_ram_free(void *ptr, size_t size); -/* - * It's an analog of GLIB's g_autoptr_cleanup_generic_gfree(), used to define - * g_autofree macro. - */ -static inline void qemu_cleanup_generic_vfree(void *p) -{ - void **pp = (void **)p; - qemu_vfree(*pp); -} - -/* - * Analog of g_autofree, but qemu_vfree is called on cleanup instead of g_free. - */ -#define QEMU_AUTO_VFREE __attribute__((cleanup(qemu_cleanup_generic_vfree))) - #ifdef _WIN32 #define HAVE_CHARDEV_SERIAL 1 #elif defined(__linux__) || defined(__sun__) || defined(__FreeBSD__) \ @@ -673,19 +655,6 @@ static inline int platform_does_not_support_system(const char *command) } #endif /* !HAVE_SYSTEM_FUNCTION */ -/** - * Duplicate directory entry @dent. - * - * It is highly recommended to use this function instead of open coding - * duplication of @c dirent objects, because the actual @c struct @c dirent - * size may be bigger or shorter than @c sizeof(struct dirent) and correct - * handling is platform specific (see gitlab issue #841). - * - * @dent - original directory entry to be duplicated - * @returns duplicated directory entry which should be freed with g_free() - */ -struct dirent *qemu_dirent_dup(struct dirent *dent); - #ifdef __cplusplus } #endif diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h index e69efbd..b063c6f 100644 --- a/include/qemu/rcu.h +++ b/include/qemu/rcu.h @@ -29,6 +29,7 @@ #include "qemu/atomic.h" #include "qemu/notify.h" #include "qemu/sys_membarrier.h" +#include "qemu/coroutine-tls.h" #ifdef __cplusplus extern "C" { @@ -76,11 +77,11 @@ struct rcu_reader_data { NotifierList force_rcu; }; -extern __thread struct rcu_reader_data rcu_reader; +QEMU_DECLARE_CO_TLS(struct rcu_reader_data, rcu_reader) static inline void rcu_read_lock(void) { - struct rcu_reader_data *p_rcu_reader = &rcu_reader; + struct rcu_reader_data *p_rcu_reader = get_ptr_rcu_reader(); unsigned ctr; if (p_rcu_reader->depth++ > 0) { @@ -96,7 +97,7 @@ static inline void rcu_read_lock(void) static inline void rcu_read_unlock(void) { - struct rcu_reader_data *p_rcu_reader = &rcu_reader; + struct rcu_reader_data *p_rcu_reader = get_ptr_rcu_reader(); assert(p_rcu_reader->depth != 0); if (--p_rcu_reader->depth > 0) { diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 5b302cb..42f4ceb 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -26,6 +26,7 @@ typedef struct AddressSpace AddressSpace; typedef struct AioContext AioContext; typedef struct Aml Aml; typedef struct AnnounceTimer AnnounceTimer; +typedef struct ArchCPU ArchCPU; typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; typedef struct BdrvDirtyBitmapIter BdrvDirtyBitmapIter; typedef struct BlockBackend BlockBackend; @@ -39,6 +40,7 @@ typedef struct CompatProperty CompatProperty; typedef struct CoMutex CoMutex; typedef struct ConfidentialGuestSupport ConfidentialGuestSupport; typedef struct CPUAddressSpace CPUAddressSpace; +typedef struct CPUArchState CPUArchState; typedef struct CPUState CPUState; typedef struct DeviceListener DeviceListener; typedef struct DeviceState DeviceState; diff --git a/include/qemu/xattr.h b/include/qemu/xattr.h index a83fe8e..f1d0f7b 100644 --- a/include/qemu/xattr.h +++ b/include/qemu/xattr.h @@ -22,7 +22,9 @@ #ifdef CONFIG_LIBATTR # include <attr/xattr.h> #else -# define ENOATTR ENODATA +# if !defined(ENOATTR) +# define ENOATTR ENODATA +# endif # include <sys/xattr.h> #endif diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h index 032f697..6013c94 100644 --- a/include/sysemu/accel-ops.h +++ b/include/sysemu/accel-ops.h @@ -28,8 +28,11 @@ struct AccelOpsClass { /* initialization function called when accel is chosen */ void (*ops_init)(AccelOpsClass *ops); + bool (*cpus_are_resettable)(void); + void (*create_vcpu_thread)(CPUState *cpu); /* MANDATORY NON-NULL */ void (*kick_vcpu_thread)(CPUState *cpu); + bool (*cpu_thread_is_idle)(CPUState *cpu); void (*synchronize_post_reset)(CPUState *cpu); void (*synchronize_post_init)(CPUState *cpu); diff --git a/include/sysemu/arch_init.h b/include/sysemu/arch_init.h index 70c5795..79c2591 100644 --- a/include/sysemu/arch_init.h +++ b/include/sysemu/arch_init.h @@ -28,4 +28,6 @@ enum { extern const uint32_t arch_type; +void qemu_init_arch_modules(void); + #endif diff --git a/include/sysemu/block-backend-common.h b/include/sysemu/block-backend-common.h new file mode 100644 index 0000000..2391679 --- /dev/null +++ b/include/sysemu/block-backend-common.h @@ -0,0 +1,102 @@ +/* + * QEMU Block backends + * + * Copyright (C) 2014-2016 Red Hat, Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 + * or later. See the COPYING.LIB file in the top-level directory. + */ + +#ifndef BLOCK_BACKEND_COMMON_H +#define BLOCK_BACKEND_COMMON_H + +#include "qemu/iov.h" +#include "block/throttle-groups.h" + +/* + * TODO Have to include block/block.h for a bunch of block layer + * types. Unfortunately, this pulls in the whole BlockDriverState + * API, which we don't want used by many BlockBackend users. Some of + * the types belong here, and the rest should be split into a common + * header and one for the BlockDriverState API. + */ +#include "block/block.h" + +/* Callbacks for block device models */ +typedef struct BlockDevOps { + + /* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + + /* + * Runs when virtual media changed (monitor commands eject, change) + * Argument load is true on load and false on eject. + * Beware: doesn't run when a host device's physical media + * changes. Sure would be useful if it did. + * Device models with removable media must implement this callback. + */ + void (*change_media_cb)(void *opaque, bool load, Error **errp); + /* + * Runs when an eject request is issued from the monitor, the tray + * is closed, and the medium is locked. + * Device models that do not implement is_medium_locked will not need + * this callback. Device models that can lock the medium or tray might + * want to implement the callback and unlock the tray when "force" is + * true, even if they do not support eject requests. + */ + void (*eject_request_cb)(void *opaque, bool force); + + /* + * Is the virtual medium locked into the device? + * Device models implement this only when device has such a lock. + */ + bool (*is_medium_locked)(void *opaque); + + /* + * I/O API functions. These functions are thread-safe. + * + * See include/block/block-io.h for more information about + * the I/O API. + */ + + /* + * Is the virtual tray open? + * Device models implement this only when the device has a tray. + */ + bool (*is_tray_open)(void *opaque); + + /* + * Runs when the size changed (e.g. monitor command block_resize) + */ + void (*resize_cb)(void *opaque); + /* + * Runs when the backend receives a drain request. + */ + void (*drained_begin)(void *opaque); + /* + * Runs when the backend's last drain request ends. + */ + void (*drained_end)(void *opaque); + /* + * Is the device still busy? + */ + bool (*drained_poll)(void *opaque); +} BlockDevOps; + +/* + * This struct is embedded in (the private) BlockBackend struct and contains + * fields that must be public. This is in particular for QLIST_ENTRY() and + * friends so that BlockBackends can be kept in lists outside block-backend.c + */ +typedef struct BlockBackendPublic { + ThrottleGroupMember throttle_group_member; +} BlockBackendPublic; + +#endif /* BLOCK_BACKEND_COMMON_H */ diff --git a/include/sysemu/block-backend-global-state.h b/include/sysemu/block-backend-global-state.h new file mode 100644 index 0000000..2e93a74 --- /dev/null +++ b/include/sysemu/block-backend-global-state.h @@ -0,0 +1,116 @@ +/* + * QEMU Block backends + * + * Copyright (C) 2014-2016 Red Hat, Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 + * or later. See the COPYING.LIB file in the top-level directory. + */ + +#ifndef BLOCK_BACKEND_GS_H +#define BLOCK_BACKEND_GS_H + +#include "block-backend-common.h" + +/* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + +BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm); +BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm, + uint64_t shared_perm, Error **errp); +BlockBackend *blk_new_open(const char *filename, const char *reference, + QDict *options, int flags, Error **errp); +int blk_get_refcnt(BlockBackend *blk); +void blk_ref(BlockBackend *blk); +void blk_unref(BlockBackend *blk); +void blk_remove_all_bs(void); +BlockBackend *blk_by_name(const char *name); +BlockBackend *blk_next(BlockBackend *blk); +BlockBackend *blk_all_next(BlockBackend *blk); +bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp); +void monitor_remove_blk(BlockBackend *blk); + +BlockBackendPublic *blk_get_public(BlockBackend *blk); +BlockBackend *blk_by_public(BlockBackendPublic *public); + +void blk_remove_bs(BlockBackend *blk); +int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp); +int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp); +bool bdrv_has_blk(BlockDriverState *bs); +bool bdrv_is_root_node(BlockDriverState *bs); +int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm, + Error **errp); +void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm); + +void blk_iostatus_enable(BlockBackend *blk); +BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk); +void blk_iostatus_disable(BlockBackend *blk); +void blk_iostatus_reset(BlockBackend *blk); +int blk_attach_dev(BlockBackend *blk, DeviceState *dev); +void blk_detach_dev(BlockBackend *blk, DeviceState *dev); +DeviceState *blk_get_attached_dev(BlockBackend *blk); +BlockBackend *blk_by_dev(void *dev); +BlockBackend *blk_by_qdev_id(const char *id, Error **errp); +void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void *opaque); + +void blk_activate(BlockBackend *blk, Error **errp); + +int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags); +void blk_aio_cancel(BlockAIOCB *acb); +int blk_commit_all(void); +void blk_drain(BlockBackend *blk); +void blk_drain_all(void); +void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, + BlockdevOnError on_write_error); +bool blk_supports_write_perm(BlockBackend *blk); +bool blk_is_sg(BlockBackend *blk); +void blk_set_enable_write_cache(BlockBackend *blk, bool wce); +int blk_get_flags(BlockBackend *blk); +bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp); +void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason); +void blk_op_block_all(BlockBackend *blk, Error *reason); +void blk_op_unblock_all(BlockBackend *blk, Error *reason); +int blk_set_aio_context(BlockBackend *blk, AioContext *new_context, + Error **errp); +void blk_add_aio_context_notifier(BlockBackend *blk, + void (*attached_aio_context)(AioContext *new_context, void *opaque), + void (*detach_aio_context)(void *opaque), void *opaque); +void blk_remove_aio_context_notifier(BlockBackend *blk, + void (*attached_aio_context)(AioContext *, + void *), + void (*detach_aio_context)(void *), + void *opaque); +void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify); +void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify); +BlockBackendRootState *blk_get_root_state(BlockBackend *blk); +void blk_update_root_state(BlockBackend *blk); +bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk); +int blk_get_open_flags_from_root_state(BlockBackend *blk); + +int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, + int64_t pos, int size); +int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size); +int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz); +int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo); + +void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg); +void blk_io_limits_disable(BlockBackend *blk); +void blk_io_limits_enable(BlockBackend *blk, const char *group); +void blk_io_limits_update_group(BlockBackend *blk, const char *group); +void blk_set_force_allow_inactivate(BlockBackend *blk); + +void blk_register_buf(BlockBackend *blk, void *host, size_t size); +void blk_unregister_buf(BlockBackend *blk, void *host); + +const BdrvChild *blk_root(BlockBackend *blk); + +int blk_make_empty(BlockBackend *blk, Error **errp); + +#endif /* BLOCK_BACKEND_GS_H */ diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h new file mode 100644 index 0000000..6517c39 --- /dev/null +++ b/include/sysemu/block-backend-io.h @@ -0,0 +1,161 @@ +/* + * QEMU Block backends + * + * Copyright (C) 2014-2016 Red Hat, Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 + * or later. See the COPYING.LIB file in the top-level directory. + */ + +#ifndef BLOCK_BACKEND_IO_H +#define BLOCK_BACKEND_IO_H + +#include "block-backend-common.h" + +/* + * I/O API functions. These functions are thread-safe. + * + * See include/block/block-io.h for more information about + * the I/O API. + */ + +const char *blk_name(const BlockBackend *blk); + +BlockDriverState *blk_bs(BlockBackend *blk); + +void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow); +void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow); +void blk_set_disable_request_queuing(BlockBackend *blk, bool disable); +bool blk_iostatus_is_enabled(const BlockBackend *blk); + +char *blk_get_attached_dev_id(BlockBackend *blk); + +BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int64_t bytes, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque); + +BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_flush(BlockBackend *blk, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, + BlockCompletionFunc *cb, void *opaque); +void blk_aio_cancel_async(BlockAIOCB *acb); +BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque); + +void blk_inc_in_flight(BlockBackend *blk); +void blk_dec_in_flight(BlockBackend *blk); +bool blk_is_inserted(BlockBackend *blk); +bool blk_is_available(BlockBackend *blk); +void blk_lock_medium(BlockBackend *blk, bool locked); +void blk_eject(BlockBackend *blk, bool eject_flag); +int64_t blk_getlength(BlockBackend *blk); +void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr); +int64_t blk_nb_sectors(BlockBackend *blk); +void *blk_try_blockalign(BlockBackend *blk, size_t size); +void *blk_blockalign(BlockBackend *blk, size_t size); +bool blk_is_writable(BlockBackend *blk); +bool blk_enable_write_cache(BlockBackend *blk); +BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read); +BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, + int error); +void blk_error_action(BlockBackend *blk, BlockErrorAction action, + bool is_read, int error); +void blk_iostatus_set_err(BlockBackend *blk, int error); +int blk_get_max_iov(BlockBackend *blk); +int blk_get_max_hw_iov(BlockBackend *blk); +void blk_set_guest_block_size(BlockBackend *blk, int align); + +void blk_io_plug(BlockBackend *blk); +void blk_io_unplug(BlockBackend *blk); +AioContext *blk_get_aio_context(BlockBackend *blk); +BlockAcctStats *blk_get_stats(BlockBackend *blk); +void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, + BlockCompletionFunc *cb, + void *opaque, int ret); + +uint32_t blk_get_request_alignment(BlockBackend *blk); +uint32_t blk_get_max_transfer(BlockBackend *blk); +uint64_t blk_get_max_hw_transfer(BlockBackend *blk); + +int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, + BlockBackend *blk_out, int64_t off_out, + int64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + + +/* + * "I/O or GS" API functions. These functions can run without + * the BQL, but only in one specific iothread/main loop. + * + * See include/block/block-io.h for more information about + * the "I/O or GS" API. + */ + +int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int bytes); +int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int bytes, + BdrvRequestFlags flags); +int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, + int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, + BdrvRequestFlags flags); +int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); + +static inline int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset, + int64_t bytes, void *buf, + BdrvRequestFlags flags) +{ + QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); + IO_OR_GS_CODE(); + + assert(bytes <= SIZE_MAX); + + return blk_co_preadv(blk, offset, bytes, &qiov, flags); +} + +static inline int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset, + int64_t bytes, void *buf, + BdrvRequestFlags flags) +{ + QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); + IO_OR_GS_CODE(); + + assert(bytes <= SIZE_MAX); + + return blk_co_pwritev(blk, offset, bytes, &qiov, flags); +} + +int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, + int64_t bytes); + +int coroutine_fn blk_co_flush(BlockBackend *blk); +int blk_flush(BlockBackend *blk); + +int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf); + +int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf, + int64_t bytes); +int blk_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes); +int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int64_t bytes, BdrvRequestFlags flags); +int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int64_t bytes, BdrvRequestFlags flags); +int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, + PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); + +#endif /* BLOCK_BACKEND_IO_H */ diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index e5e1524..038be9f 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -13,272 +13,9 @@ #ifndef BLOCK_BACKEND_H #define BLOCK_BACKEND_H -#include "qemu/iov.h" -#include "block/throttle-groups.h" +#include "block-backend-global-state.h" +#include "block-backend-io.h" -/* - * TODO Have to include block/block.h for a bunch of block layer - * types. Unfortunately, this pulls in the whole BlockDriverState - * API, which we don't want used by many BlockBackend users. Some of - * the types belong here, and the rest should be split into a common - * header and one for the BlockDriverState API. - */ -#include "block/block.h" - -/* Callbacks for block device models */ -typedef struct BlockDevOps { - /* - * Runs when virtual media changed (monitor commands eject, change) - * Argument load is true on load and false on eject. - * Beware: doesn't run when a host device's physical media - * changes. Sure would be useful if it did. - * Device models with removable media must implement this callback. - */ - void (*change_media_cb)(void *opaque, bool load, Error **errp); - /* - * Runs when an eject request is issued from the monitor, the tray - * is closed, and the medium is locked. - * Device models that do not implement is_medium_locked will not need - * this callback. Device models that can lock the medium or tray might - * want to implement the callback and unlock the tray when "force" is - * true, even if they do not support eject requests. - */ - void (*eject_request_cb)(void *opaque, bool force); - /* - * Is the virtual tray open? - * Device models implement this only when the device has a tray. - */ - bool (*is_tray_open)(void *opaque); - /* - * Is the virtual medium locked into the device? - * Device models implement this only when device has such a lock. - */ - bool (*is_medium_locked)(void *opaque); - /* - * Runs when the size changed (e.g. monitor command block_resize) - */ - void (*resize_cb)(void *opaque); - /* - * Runs when the backend receives a drain request. - */ - void (*drained_begin)(void *opaque); - /* - * Runs when the backend's last drain request ends. - */ - void (*drained_end)(void *opaque); - /* - * Is the device still busy? - */ - bool (*drained_poll)(void *opaque); -} BlockDevOps; - -/* This struct is embedded in (the private) BlockBackend struct and contains - * fields that must be public. This is in particular for QLIST_ENTRY() and - * friends so that BlockBackends can be kept in lists outside block-backend.c - * */ -typedef struct BlockBackendPublic { - ThrottleGroupMember throttle_group_member; -} BlockBackendPublic; - -BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm); -BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm, - uint64_t shared_perm, Error **errp); -BlockBackend *blk_new_open(const char *filename, const char *reference, - QDict *options, int flags, Error **errp); -int blk_get_refcnt(BlockBackend *blk); -void blk_ref(BlockBackend *blk); -void blk_unref(BlockBackend *blk); -void blk_remove_all_bs(void); -const char *blk_name(const BlockBackend *blk); -BlockBackend *blk_by_name(const char *name); -BlockBackend *blk_next(BlockBackend *blk); -BlockBackend *blk_all_next(BlockBackend *blk); -bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp); -void monitor_remove_blk(BlockBackend *blk); - -BlockBackendPublic *blk_get_public(BlockBackend *blk); -BlockBackend *blk_by_public(BlockBackendPublic *public); - -BlockDriverState *blk_bs(BlockBackend *blk); -void blk_remove_bs(BlockBackend *blk); -int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp); -int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp); -bool bdrv_has_blk(BlockDriverState *bs); -bool bdrv_is_root_node(BlockDriverState *bs); -int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm, - Error **errp); -void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm); - -void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow); -void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow); -void blk_set_disable_request_queuing(BlockBackend *blk, bool disable); -void blk_iostatus_enable(BlockBackend *blk); -bool blk_iostatus_is_enabled(const BlockBackend *blk); -BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk); -void blk_iostatus_disable(BlockBackend *blk); -void blk_iostatus_reset(BlockBackend *blk); -void blk_iostatus_set_err(BlockBackend *blk, int error); -int blk_attach_dev(BlockBackend *blk, DeviceState *dev); -void blk_detach_dev(BlockBackend *blk, DeviceState *dev); -DeviceState *blk_get_attached_dev(BlockBackend *blk); -char *blk_get_attached_dev_id(BlockBackend *blk); -BlockBackend *blk_by_dev(void *dev); -BlockBackend *blk_by_qdev_id(const char *id, Error **errp); -void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void *opaque); -int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, - int64_t bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, - int64_t bytes, - QEMUIOVector *qiov, size_t qiov_offset, - BdrvRequestFlags flags); -int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, - int64_t bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); - -static inline int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset, - int64_t bytes, void *buf, - BdrvRequestFlags flags) -{ - QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); - - assert(bytes <= SIZE_MAX); - - return blk_co_preadv(blk, offset, bytes, &qiov, flags); -} - -static inline int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset, - int64_t bytes, void *buf, - BdrvRequestFlags flags) -{ - QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); - - assert(bytes <= SIZE_MAX); - - return blk_co_pwritev(blk, offset, bytes, &qiov, flags); -} - -int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, - int64_t bytes, BdrvRequestFlags flags); -BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset, - int64_t bytes, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque); -int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags); -int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int bytes); -int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int bytes, - BdrvRequestFlags flags); -int64_t blk_getlength(BlockBackend *blk); -void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr); -int64_t blk_nb_sectors(BlockBackend *blk); -BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset, - QEMUIOVector *qiov, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque); -BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, - QEMUIOVector *qiov, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque); -BlockAIOCB *blk_aio_flush(BlockBackend *blk, - BlockCompletionFunc *cb, void *opaque); -BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, - BlockCompletionFunc *cb, void *opaque); -void blk_aio_cancel(BlockAIOCB *acb); -void blk_aio_cancel_async(BlockAIOCB *acb); -int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf); -BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque); -int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, - int64_t bytes); -int coroutine_fn blk_co_flush(BlockBackend *blk); -int blk_flush(BlockBackend *blk); -int blk_commit_all(void); -void blk_inc_in_flight(BlockBackend *blk); -void blk_dec_in_flight(BlockBackend *blk); -void blk_drain(BlockBackend *blk); -void blk_drain_all(void); -void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, - BlockdevOnError on_write_error); -BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read); -BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, - int error); -void blk_error_action(BlockBackend *blk, BlockErrorAction action, - bool is_read, int error); -bool blk_supports_write_perm(BlockBackend *blk); -bool blk_is_writable(BlockBackend *blk); -bool blk_is_sg(BlockBackend *blk); -bool blk_enable_write_cache(BlockBackend *blk); -void blk_set_enable_write_cache(BlockBackend *blk, bool wce); -void blk_invalidate_cache(BlockBackend *blk, Error **errp); -bool blk_is_inserted(BlockBackend *blk); -bool blk_is_available(BlockBackend *blk); -void blk_lock_medium(BlockBackend *blk, bool locked); -void blk_eject(BlockBackend *blk, bool eject_flag); -int blk_get_flags(BlockBackend *blk); -uint32_t blk_get_request_alignment(BlockBackend *blk); -uint32_t blk_get_max_transfer(BlockBackend *blk); -uint64_t blk_get_max_hw_transfer(BlockBackend *blk); -int blk_get_max_iov(BlockBackend *blk); -int blk_get_max_hw_iov(BlockBackend *blk); -void blk_set_guest_block_size(BlockBackend *blk, int align); -void *blk_try_blockalign(BlockBackend *blk, size_t size); -void *blk_blockalign(BlockBackend *blk, size_t size); -bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp); -void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason); -void blk_op_block_all(BlockBackend *blk, Error *reason); -void blk_op_unblock_all(BlockBackend *blk, Error *reason); -AioContext *blk_get_aio_context(BlockBackend *blk); -int blk_set_aio_context(BlockBackend *blk, AioContext *new_context, - Error **errp); -void blk_add_aio_context_notifier(BlockBackend *blk, - void (*attached_aio_context)(AioContext *new_context, void *opaque), - void (*detach_aio_context)(void *opaque), void *opaque); -void blk_remove_aio_context_notifier(BlockBackend *blk, - void (*attached_aio_context)(AioContext *, - void *), - void (*detach_aio_context)(void *), - void *opaque); -void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify); -void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify); -void blk_io_plug(BlockBackend *blk); -void blk_io_unplug(BlockBackend *blk); -BlockAcctStats *blk_get_stats(BlockBackend *blk); -BlockBackendRootState *blk_get_root_state(BlockBackend *blk); -void blk_update_root_state(BlockBackend *blk); -bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk); -int blk_get_open_flags_from_root_state(BlockBackend *blk); - -void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, - BlockCompletionFunc *cb, void *opaque); -int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, - int64_t bytes, BdrvRequestFlags flags); -int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf, - int64_t bytes); -int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, - PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); -int blk_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes); -int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, - int64_t pos, int size); -int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size); -int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz); -int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo); -BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, - BlockCompletionFunc *cb, - void *opaque, int ret); - -void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg); -void blk_io_limits_disable(BlockBackend *blk); -void blk_io_limits_enable(BlockBackend *blk, const char *group); -void blk_io_limits_update_group(BlockBackend *blk, const char *group); -void blk_set_force_allow_inactivate(BlockBackend *blk); - -void blk_register_buf(BlockBackend *blk, void *host, size_t size); -void blk_unregister_buf(BlockBackend *blk, void *host); - -int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, - BlockBackend *blk_out, int64_t off_out, - int64_t bytes, BdrvRequestFlags read_flags, - BdrvRequestFlags write_flags); - -const BdrvChild *blk_root(BlockBackend *blk); - -int blk_make_empty(BlockBackend *blk, Error **errp); +/* DO NOT ADD ANYTHING IN HERE. USE ONE OF THE HEADERS INCLUDED ABOVE */ #endif diff --git a/include/sysemu/blockdev.h b/include/sysemu/blockdev.h index f9fb54d..3211b16 100644 --- a/include/sysemu/blockdev.h +++ b/include/sysemu/blockdev.h @@ -13,9 +13,6 @@ #include "block/block.h" #include "qemu/queue.h" -void blockdev_mark_auto_del(BlockBackend *blk); -void blockdev_auto_del(BlockBackend *blk); - typedef enum { IF_DEFAULT = -1, /* for use with drive_add() only */ /* @@ -38,6 +35,16 @@ struct DriveInfo { QTAILQ_ENTRY(DriveInfo) next; }; +/* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + +void blockdev_mark_auto_del(BlockBackend *blk); +void blockdev_auto_del(BlockBackend *blk); + DriveInfo *blk_legacy_dinfo(BlockBackend *blk); DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo); BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo); diff --git a/include/sysemu/hax.h b/include/sysemu/hax.h index 247f066..bf8f99a 100644 --- a/include/sysemu/hax.h +++ b/include/sysemu/hax.h @@ -25,17 +25,23 @@ int hax_sync_vcpus(void); #ifdef NEED_CPU_H +# ifdef CONFIG_HAX +# define CONFIG_HAX_IS_POSSIBLE +# endif +#else /* !NEED_CPU_H */ +# define CONFIG_HAX_IS_POSSIBLE +#endif -#ifdef CONFIG_HAX +#ifdef CONFIG_HAX_IS_POSSIBLE -int hax_enabled(void); +extern bool hax_allowed; -#else /* CONFIG_HAX */ +#define hax_enabled() (hax_allowed) -#define hax_enabled() (0) +#else /* !CONFIG_HAX_IS_POSSIBLE */ -#endif /* CONFIG_HAX */ +#define hax_enabled() (0) -#endif /* NEED_CPU_H */ +#endif /* CONFIG_HAX_IS_POSSIBLE */ #endif /* QEMU_HAX_H */ diff --git a/include/sysemu/hw_accel.h b/include/sysemu/hw_accel.h index 01b5ebf..22903a5 100644 --- a/include/sysemu/hw_accel.h +++ b/include/sysemu/hw_accel.h @@ -23,9 +23,4 @@ void cpu_synchronize_post_reset(CPUState *cpu); void cpu_synchronize_post_init(CPUState *cpu); void cpu_synchronize_pre_loadvm(CPUState *cpu); -static inline bool cpu_check_are_resettable(void) -{ - return kvm_enabled() ? kvm_cpu_check_are_resettable() : true; -} - #endif /* QEMU_HW_ACCEL_H */ diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 6eb39a0..a5bec96 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -249,6 +249,9 @@ int kvm_has_intx_set_mask(void); bool kvm_arm_supports_user_irq(void); +int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); +int kvm_on_sigbus(int code, void *addr); + #ifdef NEED_CPU_H #include "cpu.h" @@ -261,9 +264,6 @@ int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, void kvm_remove_all_breakpoints(CPUState *cpu); int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap); -int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); -int kvm_on_sigbus(int code, void *addr); - /* internal API */ int kvm_ioctl(KVMState *s, int type, ...); diff --git a/include/sysemu/memory_mapping.h b/include/sysemu/memory_mapping.h index 4b20f1a..3bbeb1b 100644 --- a/include/sysemu/memory_mapping.h +++ b/include/sysemu/memory_mapping.h @@ -15,8 +15,7 @@ #define MEMORY_MAPPING_H #include "qemu/queue.h" -#include "exec/cpu-defs.h" -#include "exec/memory.h" +#include "exec/cpu-common.h" typedef struct GuestPhysBlock { /* visible to guest, reflects PCI hole, etc */ @@ -43,7 +42,7 @@ typedef struct GuestPhysBlockList { /* The physical and virtual address in the memory mapping are contiguous. */ typedef struct MemoryMapping { hwaddr phys_addr; - target_ulong virt_addr; + vaddr virt_addr; ram_addr_t length; QTAILQ_ENTRY(MemoryMapping) next; } MemoryMapping; diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h index 2edf336..dd64fb4 100644 --- a/include/sysemu/os-posix.h +++ b/include/sysemu/os-posix.h @@ -55,6 +55,7 @@ int os_mlock(void); typedef struct timeval qemu_timeval; #define qemu_gettimeofday(tp) gettimeofday(tp, NULL) +int os_set_daemonize(bool d); bool is_daemonized(void); /** diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h index 43f569b..7707522 100644 --- a/include/sysemu/os-win32.h +++ b/include/sysemu/os-win32.h @@ -77,6 +77,14 @@ typedef struct { } qemu_timeval; int qemu_gettimeofday(qemu_timeval *tp); +static inline int os_set_daemonize(bool d) +{ + if (d) { + return -ENOTSUP; + } + return 0; +} + static inline bool is_daemonized(void) { return false; diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h index 675873e..dd44473 100644 --- a/include/tcg/tcg-opc.h +++ b/include/tcg/tcg-opc.h @@ -245,6 +245,9 @@ DEF(or_vec, 1, 2, 0, IMPLVEC) DEF(xor_vec, 1, 2, 0, IMPLVEC) DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec)) DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec)) +DEF(nand_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_nand_vec)) +DEF(nor_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_nor_vec)) +DEF(eqv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_eqv_vec)) DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec)) DEF(shli_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index 42f5b50..73869fd 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -43,7 +43,7 @@ #else #define MAX_OPC_PARAM_PER_ARG 1 #endif -#define MAX_OPC_PARAM_IARGS 6 +#define MAX_OPC_PARAM_IARGS 7 #define MAX_OPC_PARAM_OARGS 1 #define MAX_OPC_PARAM_ARGS (MAX_OPC_PARAM_IARGS + MAX_OPC_PARAM_OARGS) @@ -183,6 +183,9 @@ typedef uint64_t TCGRegSet; #define TCG_TARGET_HAS_not_vec 0 #define TCG_TARGET_HAS_andc_vec 0 #define TCG_TARGET_HAS_orc_vec 0 +#define TCG_TARGET_HAS_nand_vec 0 +#define TCG_TARGET_HAS_nor_vec 0 +#define TCG_TARGET_HAS_eqv_vec 0 #define TCG_TARGET_HAS_roti_vec 0 #define TCG_TARGET_HAS_rots_vec 0 #define TCG_TARGET_HAS_rotv_vec 0 |