diff options
Diffstat (limited to 'block/file-posix.c')
-rw-r--r-- | block/file-posix.c | 209 |
1 files changed, 192 insertions, 17 deletions
diff --git a/block/file-posix.c b/block/file-posix.c index 56d1972..9b5f08c 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -41,6 +41,7 @@ #include "scsi/pr-manager.h" #include "scsi/constants.h" +#include "scsi/utils.h" #if defined(__APPLE__) && (__MACH__) #include <sys/ioctl.h> @@ -72,6 +73,7 @@ #include <linux/blkzoned.h> #endif #include <linux/cdrom.h> +#include <linux/dm-ioctl.h> #include <linux/fd.h> #include <linux/fs.h> #include <linux/hdreg.h> @@ -110,6 +112,10 @@ #include <sys/diskslice.h> #endif +#ifdef EMSCRIPTEN +#include <sys/ioctl.h> +#endif + /* OS X does not have O_DSYNC */ #ifndef O_DSYNC #ifdef O_SYNC @@ -134,6 +140,22 @@ #define RAW_LOCK_PERM_BASE 100 #define RAW_LOCK_SHARED_BASE 200 +/* + * Multiple retries are mostly meant for two separate scenarios: + * + * - DM_MPATH_PROBE_PATHS returns success, but before SG_IO completes, another + * path goes down. + * + * - DM_MPATH_PROBE_PATHS failed all paths in the current path group, so we have + * to send another SG_IO to switch to another path group to probe the paths in + * it. + * + * Even if each path is in a separate path group (path_grouping_policy set to + * failover), it's rare to have more than eight path groups - and even then + * pretty unlikely that only bad path groups would be chosen in eight retries. + */ +#define SG_IO_MAX_RETRIES 8 + typedef struct BDRVRawState { int fd; bool use_lock; @@ -161,6 +183,7 @@ typedef struct BDRVRawState { bool use_linux_aio:1; bool has_laio_fdsync:1; bool use_linux_io_uring:1; + bool use_mpath:1; int page_cache_inconsistent; /* errno from fdatasync failure */ bool has_fallocate; bool needs_alignment; @@ -781,17 +804,6 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif - if (S_ISBLK(st.st_mode)) { -#ifdef __linux__ - /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do - * not rely on the contents of discarded blocks unless using O_DIRECT. - * Same for BLKZEROOUT. - */ - if (!(bs->open_flags & BDRV_O_NOCACHE)) { - s->has_write_zeroes = false; - } -#endif - } #ifdef __FreeBSD__ if (S_ISCHR(st.st_mode)) { /* @@ -1276,10 +1288,10 @@ static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned) } #endif /* defined(CONFIG_BLKZONED) */ +#ifdef CONFIG_LINUX /* * Get a sysfs attribute value as a long integer. */ -#ifdef CONFIG_LINUX static long get_sysfs_long_val(struct stat *st, const char *attribute) { g_autofree char *str = NULL; @@ -1299,6 +1311,30 @@ static long get_sysfs_long_val(struct stat *st, const char *attribute) } return ret; } + +/* + * Get a sysfs attribute value as a uint32_t. + */ +static int get_sysfs_u32_val(struct stat *st, const char *attribute, + uint32_t *u32) +{ + g_autofree char *str = NULL; + const char *end; + unsigned int val; + int ret; + + ret = get_sysfs_str_val(st, attribute, &str); + if (ret < 0) { + return ret; + } + + /* The file is ended with '\n', pass 'end' to accept that. */ + ret = qemu_strtoui(str, &end, 10, &val); + if (ret == 0 && end && *end == '\0') { + *u32 = val; + } + return ret; +} #endif static int hdev_get_max_segments(int fd, struct stat *st) @@ -1318,6 +1354,23 @@ static int hdev_get_max_segments(int fd, struct stat *st) #endif } +/* + * Fills in *dalign with the discard alignment and returns 0 on success, + * -errno otherwise. + */ +static int hdev_get_pdiscard_alignment(struct stat *st, uint32_t *dalign) +{ +#ifdef CONFIG_LINUX + /* + * Note that Linux "discard_granularity" is QEMU "discard_alignment". Linux + * "discard_alignment" is something else. + */ + return get_sysfs_u32_val(st, "discard_granularity", dalign); +#else + return -ENOTSUP; +#endif +} + #if defined(CONFIG_BLKZONED) /* * If the reset_all flag is true, then the wps of zone whose state is @@ -1527,6 +1580,30 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) } } + if (S_ISBLK(st.st_mode)) { + uint32_t dalign = 0; + int ret; + + ret = hdev_get_pdiscard_alignment(&st, &dalign); + if (ret == 0 && dalign != 0) { + uint32_t ralign = bs->bl.request_alignment; + + /* Probably never happens, but handle it just in case */ + if (dalign < ralign && (ralign % dalign == 0)) { + dalign = ralign; + } + + /* The block layer requires a multiple of request_alignment */ + if (dalign % ralign != 0) { + error_setg(errp, "Invalid pdiscard_alignment limit %u is not a " + "multiple of request_alignment %u", dalign, ralign); + return; + } + + bs->bl.pdiscard_alignment = dalign; + } + } + raw_refresh_zoned_limits(bs, &st, errp); } @@ -2011,8 +2088,11 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque) } #ifndef HAVE_COPY_FILE_RANGE -static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, - off_t *out_off, size_t len, unsigned int flags) +#ifndef EMSCRIPTEN +static +#endif +ssize_t copy_file_range(int in_fd, off_t *in_off, int out_fd, + off_t *out_off, size_t len, unsigned int flags) { #ifdef __NR_copy_file_range return syscall(__NR_copy_file_range, in_fd, in_off, out_fd, @@ -3201,7 +3281,7 @@ static int find_allocation(BlockDriverState *bs, off_t start, * well exceed it. */ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, - bool want_zero, + unsigned int mode, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, @@ -3217,7 +3297,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, return ret; } - if (!want_zero) { + if (!(mode & BDRV_WANT_ZERO)) { + /* There is no backing file - all bytes are allocated in this file. */ *pnum = bytes; *map = offset; *file = bs; @@ -4191,15 +4272,105 @@ hdev_open_Mac_error: /* Since this does ioctl the device must be already opened */ bs->sg = hdev_is_sg(bs); + /* sg devices aren't even block devices and can't use dm-mpath */ + s->use_mpath = !bs->sg; + return ret; } #if defined(__linux__) +#if defined(DM_MPATH_PROBE_PATHS) +static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr) +{ + if (ret < 0) { + switch (ret) { + case -ENODEV: + return true; + case -EAGAIN: + /* + * The device is probably suspended. This happens while the dm table + * is reloaded, e.g. because a path is added or removed. This is an + * operation that should complete within 1ms, so just wait a bit and + * retry. + * + * If the device was suspended for another reason, we'll wait and + * retry SG_IO_MAX_RETRIES times. This is a tolerable delay before + * we return an error and potentially stop the VM. + */ + qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000); + return true; + default: + return false; + } + } + + if (io_hdr->host_status != SCSI_HOST_OK) { + return true; + } + + switch (io_hdr->status) { + case GOOD: + case CONDITION_GOOD: + case INTERMEDIATE_GOOD: + case INTERMEDIATE_C_GOOD: + case RESERVATION_CONFLICT: + case COMMAND_TERMINATED: + return false; + case CHECK_CONDITION: + return !scsi_sense_buf_is_guest_recoverable(io_hdr->sbp, + io_hdr->mx_sb_len); + default: + return true; + } +} + +static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret) +{ + BDRVRawState *s = acb->bs->opaque; + RawPosixAIOData probe_acb; + + if (!s->use_mpath) { + return false; + } + + if (!sgio_path_error(ret, acb->ioctl.buf)) { + return false; + } + + probe_acb = (RawPosixAIOData) { + .bs = acb->bs, + .aio_type = QEMU_AIO_IOCTL, + .aio_fildes = s->fd, + .aio_offset = 0, + .ioctl = { + .buf = NULL, + .cmd = DM_MPATH_PROBE_PATHS, + }, + }; + + ret = raw_thread_pool_submit(handle_aiocb_ioctl, &probe_acb); + if (ret == -ENOTTY) { + s->use_mpath = false; + } else if (ret == -EAGAIN) { + /* The device might be suspended for a table reload, worth retrying */ + return true; + } + + return ret == 0; +} +#else +static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret) +{ + return false; +} +#endif /* DM_MPATH_PROBE_PATHS */ + static int coroutine_fn hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) { BDRVRawState *s = bs->opaque; RawPosixAIOData acb; + int retries = SG_IO_MAX_RETRIES; int ret; ret = fd_open(bs); @@ -4227,7 +4398,11 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) }, }; - return raw_thread_pool_submit(handle_aiocb_ioctl, &acb); + do { + ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb); + } while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret)); + + return ret; } #endif /* linux */ |