aboutsummaryrefslogtreecommitdiff
path: root/block/file-posix.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/file-posix.c')
-rw-r--r--block/file-posix.c257
1 files changed, 220 insertions, 37 deletions
diff --git a/block/file-posix.c b/block/file-posix.c
index be25e35..9b5f08c 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -36,11 +36,12 @@
#include "block/thread-pool.h"
#include "qemu/iov.h"
#include "block/raw-aio.h"
-#include "qapi/qmp/qdict.h"
-#include "qapi/qmp/qstring.h"
+#include "qobject/qdict.h"
+#include "qobject/qstring.h"
#include "scsi/pr-manager.h"
#include "scsi/constants.h"
+#include "scsi/utils.h"
#if defined(__APPLE__) && (__MACH__)
#include <sys/ioctl.h>
@@ -72,6 +73,7 @@
#include <linux/blkzoned.h>
#endif
#include <linux/cdrom.h>
+#include <linux/dm-ioctl.h>
#include <linux/fd.h>
#include <linux/fs.h>
#include <linux/hdreg.h>
@@ -110,6 +112,10 @@
#include <sys/diskslice.h>
#endif
+#ifdef EMSCRIPTEN
+#include <sys/ioctl.h>
+#endif
+
/* OS X does not have O_DSYNC */
#ifndef O_DSYNC
#ifdef O_SYNC
@@ -134,6 +140,22 @@
#define RAW_LOCK_PERM_BASE 100
#define RAW_LOCK_SHARED_BASE 200
+/*
+ * Multiple retries are mostly meant for two separate scenarios:
+ *
+ * - DM_MPATH_PROBE_PATHS returns success, but before SG_IO completes, another
+ * path goes down.
+ *
+ * - DM_MPATH_PROBE_PATHS failed all paths in the current path group, so we have
+ * to send another SG_IO to switch to another path group to probe the paths in
+ * it.
+ *
+ * Even if each path is in a separate path group (path_grouping_policy set to
+ * failover), it's rare to have more than eight path groups - and even then
+ * pretty unlikely that only bad path groups would be chosen in eight retries.
+ */
+#define SG_IO_MAX_RETRIES 8
+
typedef struct BDRVRawState {
int fd;
bool use_lock;
@@ -161,6 +183,7 @@ typedef struct BDRVRawState {
bool use_linux_aio:1;
bool has_laio_fdsync:1;
bool use_linux_io_uring:1;
+ bool use_mpath:1;
int page_cache_inconsistent; /* errno from fdatasync failure */
bool has_fallocate;
bool needs_alignment;
@@ -194,6 +217,7 @@ static int fd_open(BlockDriverState *bs)
}
static int64_t raw_getlength(BlockDriverState *bs);
+static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs);
typedef struct RawPosixAIOData {
BlockDriverState *bs;
@@ -780,17 +804,6 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
#endif
- if (S_ISBLK(st.st_mode)) {
-#ifdef __linux__
- /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do
- * not rely on the contents of discarded blocks unless using O_DIRECT.
- * Same for BLKZEROOUT.
- */
- if (!(bs->open_flags & BDRV_O_NOCACHE)) {
- s->has_write_zeroes = false;
- }
-#endif
- }
#ifdef __FreeBSD__
if (S_ISCHR(st.st_mode)) {
/*
@@ -804,6 +817,13 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
#endif
s->needs_alignment = raw_needs_alignment(bs);
+ bs->supported_write_flags = BDRV_REQ_FUA;
+ if (s->use_linux_aio && !laio_has_fua()) {
+ bs->supported_write_flags &= ~BDRV_REQ_FUA;
+ } else if (s->use_linux_io_uring && !luring_has_fua()) {
+ bs->supported_write_flags &= ~BDRV_REQ_FUA;
+ }
+
bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
if (S_ISREG(st.st_mode)) {
/* When extending regular files, we get zeros from the OS */
@@ -1268,10 +1288,10 @@ static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
}
#endif /* defined(CONFIG_BLKZONED) */
+#ifdef CONFIG_LINUX
/*
* Get a sysfs attribute value as a long integer.
*/
-#ifdef CONFIG_LINUX
static long get_sysfs_long_val(struct stat *st, const char *attribute)
{
g_autofree char *str = NULL;
@@ -1291,6 +1311,30 @@ static long get_sysfs_long_val(struct stat *st, const char *attribute)
}
return ret;
}
+
+/*
+ * Get a sysfs attribute value as a uint32_t.
+ */
+static int get_sysfs_u32_val(struct stat *st, const char *attribute,
+ uint32_t *u32)
+{
+ g_autofree char *str = NULL;
+ const char *end;
+ unsigned int val;
+ int ret;
+
+ ret = get_sysfs_str_val(st, attribute, &str);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* The file is ended with '\n', pass 'end' to accept that. */
+ ret = qemu_strtoui(str, &end, 10, &val);
+ if (ret == 0 && end && *end == '\0') {
+ *u32 = val;
+ }
+ return ret;
+}
#endif
static int hdev_get_max_segments(int fd, struct stat *st)
@@ -1310,6 +1354,23 @@ static int hdev_get_max_segments(int fd, struct stat *st)
#endif
}
+/*
+ * Fills in *dalign with the discard alignment and returns 0 on success,
+ * -errno otherwise.
+ */
+static int hdev_get_pdiscard_alignment(struct stat *st, uint32_t *dalign)
+{
+#ifdef CONFIG_LINUX
+ /*
+ * Note that Linux "discard_granularity" is QEMU "discard_alignment". Linux
+ * "discard_alignment" is something else.
+ */
+ return get_sysfs_u32_val(st, "discard_granularity", dalign);
+#else
+ return -ENOTSUP;
+#endif
+}
+
#if defined(CONFIG_BLKZONED)
/*
* If the reset_all flag is true, then the wps of zone whose state is
@@ -1398,7 +1459,7 @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
Error **errp)
{
BDRVRawState *s = bs->opaque;
- BlockZoneModel zoned;
+ BlockZoneModel zoned = BLK_Z_NONE;
int ret;
ret = get_sysfs_zoned_model(st, &zoned);
@@ -1519,6 +1580,30 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
}
}
+ if (S_ISBLK(st.st_mode)) {
+ uint32_t dalign = 0;
+ int ret;
+
+ ret = hdev_get_pdiscard_alignment(&st, &dalign);
+ if (ret == 0 && dalign != 0) {
+ uint32_t ralign = bs->bl.request_alignment;
+
+ /* Probably never happens, but handle it just in case */
+ if (dalign < ralign && (ralign % dalign == 0)) {
+ dalign = ralign;
+ }
+
+ /* The block layer requires a multiple of request_alignment */
+ if (dalign % ralign != 0) {
+ error_setg(errp, "Invalid pdiscard_alignment limit %u is not a "
+ "multiple of request_alignment %u", dalign, ralign);
+ return;
+ }
+
+ bs->bl.pdiscard_alignment = dalign;
+ }
+ }
+
raw_refresh_zoned_limits(bs, &st, errp);
}
@@ -2003,8 +2088,11 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque)
}
#ifndef HAVE_COPY_FILE_RANGE
-static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
- off_t *out_off, size_t len, unsigned int flags)
+#ifndef EMSCRIPTEN
+static
+#endif
+ssize_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
+ off_t *out_off, size_t len, unsigned int flags)
{
#ifdef __NR_copy_file_range
return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
@@ -2477,7 +2565,8 @@ static inline bool raw_check_linux_aio(BDRVRawState *s)
#endif
static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
- uint64_t bytes, QEMUIOVector *qiov, int type)
+ uint64_t bytes, QEMUIOVector *qiov, int type,
+ int flags)
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
@@ -2508,13 +2597,13 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
#ifdef CONFIG_LINUX_IO_URING
} else if (raw_check_linux_io_uring(s)) {
assert(qiov->size == bytes);
- ret = luring_co_submit(bs, s->fd, offset, qiov, type);
+ ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
goto out;
#endif
#ifdef CONFIG_LINUX_AIO
} else if (raw_check_linux_aio(s)) {
assert(qiov->size == bytes);
- ret = laio_co_submit(s->fd, offset, qiov, type,
+ ret = laio_co_submit(s->fd, offset, qiov, type, flags,
s->aio_max_batch);
goto out;
#endif
@@ -2534,6 +2623,10 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
assert(qiov->size == bytes);
ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
+ if (ret == 0 && (flags & BDRV_REQ_FUA)) {
+ /* TODO Use pwritev2() instead if it's available */
+ ret = raw_co_flush_to_disk(bs);
+ }
goto out; /* Avoid the compiler err of unused label */
out:
@@ -2571,14 +2664,14 @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
int64_t bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags)
{
- return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ);
+ return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ, flags);
}
static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
int64_t bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags)
{
- return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE);
+ return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE, flags);
}
static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
@@ -2600,12 +2693,12 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
#ifdef CONFIG_LINUX_IO_URING
if (raw_check_linux_io_uring(s)) {
- return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH);
+ return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
}
#endif
#ifdef CONFIG_LINUX_AIO
if (s->has_laio_fdsync && raw_check_linux_aio(s)) {
- return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
+ return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0, 0);
}
#endif
return raw_thread_pool_submit(handle_aiocb_flush, &acb);
@@ -3188,7 +3281,7 @@ static int find_allocation(BlockDriverState *bs, off_t start,
* well exceed it.
*/
static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
- bool want_zero,
+ unsigned int mode,
int64_t offset,
int64_t bytes, int64_t *pnum,
int64_t *map,
@@ -3204,7 +3297,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
return ret;
}
- if (!want_zero) {
+ if (!(mode & BDRV_WANT_ZERO)) {
+ /* There is no backing file - all bytes are allocated in this file. */
*pnum = bytes;
*map = offset;
*file = bs;
@@ -3540,7 +3634,7 @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
}
trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
- return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND);
+ return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND, 0);
}
#endif
@@ -3886,7 +3980,7 @@ BlockDriver bdrv_file = {
.bdrv_needs_filename = true,
.bdrv_probe = NULL, /* no probe for protocols */
.bdrv_parse_filename = raw_parse_filename,
- .bdrv_file_open = raw_open,
+ .bdrv_open = raw_open,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
@@ -3929,11 +4023,6 @@ BlockDriver bdrv_file = {
static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
CFIndex maxPathSize, int flags);
-#if !defined(MAC_OS_VERSION_12_0) \
- || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_VERSION_12_0)
-#define IOMainPort IOMasterPort
-#endif
-
static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
{
kern_return_t kernResult = KERN_FAILURE;
@@ -4183,15 +4272,105 @@ hdev_open_Mac_error:
/* Since this does ioctl the device must be already opened */
bs->sg = hdev_is_sg(bs);
+ /* sg devices aren't even block devices and can't use dm-mpath */
+ s->use_mpath = !bs->sg;
+
return ret;
}
#if defined(__linux__)
+#if defined(DM_MPATH_PROBE_PATHS)
+static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr)
+{
+ if (ret < 0) {
+ switch (ret) {
+ case -ENODEV:
+ return true;
+ case -EAGAIN:
+ /*
+ * The device is probably suspended. This happens while the dm table
+ * is reloaded, e.g. because a path is added or removed. This is an
+ * operation that should complete within 1ms, so just wait a bit and
+ * retry.
+ *
+ * If the device was suspended for another reason, we'll wait and
+ * retry SG_IO_MAX_RETRIES times. This is a tolerable delay before
+ * we return an error and potentially stop the VM.
+ */
+ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ if (io_hdr->host_status != SCSI_HOST_OK) {
+ return true;
+ }
+
+ switch (io_hdr->status) {
+ case GOOD:
+ case CONDITION_GOOD:
+ case INTERMEDIATE_GOOD:
+ case INTERMEDIATE_C_GOOD:
+ case RESERVATION_CONFLICT:
+ case COMMAND_TERMINATED:
+ return false;
+ case CHECK_CONDITION:
+ return !scsi_sense_buf_is_guest_recoverable(io_hdr->sbp,
+ io_hdr->mx_sb_len);
+ default:
+ return true;
+ }
+}
+
+static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
+{
+ BDRVRawState *s = acb->bs->opaque;
+ RawPosixAIOData probe_acb;
+
+ if (!s->use_mpath) {
+ return false;
+ }
+
+ if (!sgio_path_error(ret, acb->ioctl.buf)) {
+ return false;
+ }
+
+ probe_acb = (RawPosixAIOData) {
+ .bs = acb->bs,
+ .aio_type = QEMU_AIO_IOCTL,
+ .aio_fildes = s->fd,
+ .aio_offset = 0,
+ .ioctl = {
+ .buf = NULL,
+ .cmd = DM_MPATH_PROBE_PATHS,
+ },
+ };
+
+ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &probe_acb);
+ if (ret == -ENOTTY) {
+ s->use_mpath = false;
+ } else if (ret == -EAGAIN) {
+ /* The device might be suspended for a table reload, worth retrying */
+ return true;
+ }
+
+ return ret == 0;
+}
+#else
+static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
+{
+ return false;
+}
+#endif /* DM_MPATH_PROBE_PATHS */
+
static int coroutine_fn
hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
+ int retries = SG_IO_MAX_RETRIES;
int ret;
ret = fd_open(bs);
@@ -4219,7 +4398,11 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
},
};
- return raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
+ do {
+ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
+ } while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret));
+
+ return ret;
}
#endif /* linux */
@@ -4257,7 +4440,7 @@ static BlockDriver bdrv_host_device = {
.bdrv_needs_filename = true,
.bdrv_probe_device = hdev_probe_device,
.bdrv_parse_filename = hdev_parse_filename,
- .bdrv_file_open = hdev_open,
+ .bdrv_open = hdev_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
@@ -4396,7 +4579,7 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_needs_filename = true,
.bdrv_probe_device = cdrom_probe_device,
.bdrv_parse_filename = cdrom_parse_filename,
- .bdrv_file_open = cdrom_open,
+ .bdrv_open = cdrom_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
@@ -4522,7 +4705,7 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_needs_filename = true,
.bdrv_probe_device = cdrom_probe_device,
.bdrv_parse_filename = cdrom_parse_filename,
- .bdrv_file_open = cdrom_open,
+ .bdrv_open = cdrom_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,