Diffstat (limited to 'block/file-posix.c')
-rw-r--r--  block/file-posix.c | 160
1 file changed, 132 insertions(+), 28 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index ef52ed9..8c73867 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -41,6 +41,7 @@
#include "scsi/pr-manager.h"
#include "scsi/constants.h"
+#include "scsi/utils.h"
#if defined(__APPLE__) && (__MACH__)
#include <sys/ioctl.h>
@@ -72,6 +73,7 @@
#include <linux/blkzoned.h>
#endif
#include <linux/cdrom.h>
+#include <linux/dm-ioctl.h>
#include <linux/fd.h>
#include <linux/fs.h>
#include <linux/hdreg.h>
@@ -138,6 +140,22 @@
#define RAW_LOCK_PERM_BASE 100
#define RAW_LOCK_SHARED_BASE 200
+/*
+ * Multiple retries are mostly meant for two separate scenarios:
+ *
+ * - DM_MPATH_PROBE_PATHS returns success, but before SG_IO completes, another
+ * path goes down.
+ *
+ * - DM_MPATH_PROBE_PATHS failed all paths in the current path group, so we have
+ * to send another SG_IO to switch to another path group to probe the paths in
+ * it.
+ *
+ * Even if each path is in a separate path group (path_grouping_policy set to
+ * failover), it's rare to have more than eight path groups - and even then
+ * pretty unlikely that only bad path groups would be chosen in eight retries.
+ */
+#define SG_IO_MAX_RETRIES 8
+
typedef struct BDRVRawState {
int fd;
bool use_lock;
@@ -165,6 +183,7 @@ typedef struct BDRVRawState {
bool use_linux_aio:1;
bool has_laio_fdsync:1;
bool use_linux_io_uring:1;
+ bool use_mpath:1;
int page_cache_inconsistent; /* errno from fdatasync failure */
bool has_fallocate;
bool needs_alignment;
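
The retry budget introduced above is easiest to see stripped of QEMU's infrastructure. Below is a hedged, minimal sketch (not code from this patch): it assumes a kernel whose <linux/dm-ioctl.h> defines DM_MPATH_PROBE_PATHS, reduces path-error detection to ENODEV or a non-zero host_status, and the function name sg_io_with_probe is invented for the example; the actual classification added by this patch (sgio_path_error() further down) is considerably more careful.

/*
 * Hedged sketch, not part of the patch: the retry pattern reduced to a bare
 * ioctl loop on a dm-mpath block device fd.
 */
#include <errno.h>
#include <stdbool.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>
#include <linux/dm-ioctl.h>

#define SG_IO_MAX_RETRIES 8

static int sg_io_with_probe(int fd, sg_io_hdr_t *hdr)
{
    int retries = SG_IO_MAX_RETRIES;

    for (;;) {
        bool path_error;

        if (ioctl(fd, SG_IO, hdr) < 0) {
            if (errno != ENODEV) {
                return -errno;              /* unrelated failure, give up */
            }
            path_error = true;              /* e.g. no usable path left */
        } else {
            path_error = hdr->host_status != 0; /* transport-level error */
        }

        if (!path_error) {
            return 0;                       /* command reached the device */
        }
        if (retries-- == 0) {
            return -EIO;                    /* retry budget exhausted */
        }
        /* Ask dm-mpath to re-check its paths; ENOTTY means not multipath */
        if (ioctl(fd, DM_MPATH_PROBE_PATHS) < 0 && errno != EAGAIN) {
            return -EIO;
        }
    }
}

The essential shape is that a failed SG_IO on a dm-mpath node is only worth resubmitting after asking device-mapper to re-probe its paths, and only a bounded number of times.
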
@@ -785,17 +804,6 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
#endif
- if (S_ISBLK(st.st_mode)) {
-#ifdef __linux__
- /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do
- * not rely on the contents of discarded blocks unless using O_DIRECT.
- * Same for BLKZEROOUT.
- */
- if (!(bs->open_flags & BDRV_O_NOCACHE)) {
- s->has_write_zeroes = false;
- }
-#endif
- }
#ifdef __FreeBSD__
if (S_ISCHR(st.st_mode)) {
/*
@@ -2556,9 +2564,9 @@ static inline bool raw_check_linux_aio(BDRVRawState *s)
}
#endif
-static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
- uint64_t bytes, QEMUIOVector *qiov, int type,
- int flags)
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, uint64_t bytes,
+ QEMUIOVector *qiov, int type, int flags)
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
@@ -2617,7 +2625,7 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
if (ret == 0 && (flags & BDRV_REQ_FUA)) {
/* TODO Use pwritev2() instead if it's available */
- ret = raw_co_flush_to_disk(bs);
+ ret = bdrv_co_flush(bs);
}
goto out; /* Avoid the compiler err of unused label */
@@ -2652,16 +2660,16 @@ out:
return ret;
}
-static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
- int64_t bytes, QEMUIOVector *qiov,
- BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
+ QEMUIOVector *qiov, BdrvRequestFlags flags)
{
return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ, flags);
}
-static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
- int64_t bytes, QEMUIOVector *qiov,
- BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
+ QEMUIOVector *qiov, BdrvRequestFlags flags)
{
return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE, flags);
}
@@ -3273,7 +3281,7 @@ static int find_allocation(BlockDriverState *bs, off_t start,
* well exceed it.
*/
static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
- bool want_zero,
+ unsigned int mode,
int64_t offset,
int64_t bytes, int64_t *pnum,
int64_t *map,
@@ -3289,7 +3297,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
return ret;
}
- if (!want_zero) {
+ if (!(mode & BDRV_WANT_ZERO)) {
+ /* There is no backing file - all bytes are allocated in this file. */
*pnum = bytes;
*map = offset;
*file = bs;
@@ -3597,10 +3606,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
#endif
#if defined(CONFIG_BLKZONED)
-static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
- int64_t *offset,
- QEMUIOVector *qiov,
- BdrvRequestFlags flags) {
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_append(BlockDriverState *bs,
+ int64_t *offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags) {
assert(flags == 0);
int64_t zone_size_mask = bs->bl.zone_size - 1;
int64_t iov_len = 0;
@@ -4263,15 +4273,105 @@ hdev_open_Mac_error:
/* Since this does ioctl the device must be already opened */
bs->sg = hdev_is_sg(bs);
+ /* sg devices aren't even block devices and can't use dm-mpath */
+ s->use_mpath = !bs->sg;
+
return ret;
}
#if defined(__linux__)
+#if defined(DM_MPATH_PROBE_PATHS)
+static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr)
+{
+ if (ret < 0) {
+ switch (ret) {
+ case -ENODEV:
+ return true;
+ case -EAGAIN:
+ /*
+ * The device is probably suspended. This happens while the dm table
+ * is reloaded, e.g. because a path is added or removed. This is an
+ * operation that should complete within 1ms, so just wait a bit and
+ * retry.
+ *
+ * If the device was suspended for another reason, we'll wait and
+ * retry SG_IO_MAX_RETRIES times. This is a tolerable delay before
+ * we return an error and potentially stop the VM.
+ */
+ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ if (io_hdr->host_status != SCSI_HOST_OK) {
+ return true;
+ }
+
+ switch (io_hdr->status) {
+ case GOOD:
+ case CONDITION_GOOD:
+ case INTERMEDIATE_GOOD:
+ case INTERMEDIATE_C_GOOD:
+ case RESERVATION_CONFLICT:
+ case COMMAND_TERMINATED:
+ return false;
+ case CHECK_CONDITION:
+ return !scsi_sense_buf_is_guest_recoverable(io_hdr->sbp,
+ io_hdr->mx_sb_len);
+ default:
+ return true;
+ }
+}
+
+static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
+{
+ BDRVRawState *s = acb->bs->opaque;
+ RawPosixAIOData probe_acb;
+
+ if (!s->use_mpath) {
+ return false;
+ }
+
+ if (!sgio_path_error(ret, acb->ioctl.buf)) {
+ return false;
+ }
+
+ probe_acb = (RawPosixAIOData) {
+ .bs = acb->bs,
+ .aio_type = QEMU_AIO_IOCTL,
+ .aio_fildes = s->fd,
+ .aio_offset = 0,
+ .ioctl = {
+ .buf = NULL,
+ .cmd = DM_MPATH_PROBE_PATHS,
+ },
+ };
+
+ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &probe_acb);
+ if (ret == -ENOTTY) {
+ s->use_mpath = false;
+ } else if (ret == -EAGAIN) {
+ /* The device might be suspended for a table reload, worth retrying */
+ return true;
+ }
+
+ return ret == 0;
+}
+#else
+static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
+{
+ return false;
+}
+#endif /* DM_MPATH_PROBE_PATHS */
+
static int coroutine_fn
hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
+ int retries = SG_IO_MAX_RETRIES;
int ret;
ret = fd_open(bs);
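
For readers less familiar with the SG_IO interface, the sketch below (hedged, not part of the patch) shows the three layers that sgio_path_error() distinguishes, on an sg_io_hdr filled for a simple TEST UNIT READY: the ioctl return value, the transport's host_status, and the SCSI status byte with its sense buffer. The helper name probe_device_once and the 5-second timeout are invented for the example.

#include <errno.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>

static int probe_device_once(int fd)
{
    unsigned char cdb[6] = { 0 };           /* 0x00 = TEST UNIT READY */
    unsigned char sense[32] = { 0 };
    sg_io_hdr_t hdr = {
        .interface_id    = 'S',
        .dxfer_direction = SG_DXFER_NONE,
        .cmd_len         = sizeof(cdb),
        .cmdp            = cdb,
        .mx_sb_len       = sizeof(sense),
        .sbp             = sense,
        .timeout         = 5000,            /* ms */
    };

    if (ioctl(fd, SG_IO, &hdr) < 0) {
        return -errno;                      /* layer 1: the ioctl itself */
    }
    if (hdr.host_status != 0) {             /* layer 2: transport status */
        return -EIO;                        /* (SCSI_HOST_OK is 0) */
    }
    if (hdr.status == 0x02) {               /* layer 3: CHECK CONDITION */
        /*
         * sense[] now holds the sense data; the patch lets
         * scsi_sense_buf_is_guest_recoverable() decide whether this is a
         * path error or something the guest should see.
         */
        return -EIO;
    }
    return 0;
}
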
@@ -4299,7 +4399,11 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
},
};
- return raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
+ do {
+ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
+ } while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret));
+
+ return ret;
}
#endif /* linux */
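
One subtlety in the retry loop above: the && chain short-circuits, so non-SG_IO ioctls never touch the multipath machinery, and the post-decrement retries-- caps the loop at SG_IO_MAX_RETRIES extra submissions after the initial one. An equivalent long-hand expansion (sketch only) makes the evaluation order explicit:

do {
    ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
    if (req != SG_IO || retries-- == 0) {
        break;
    }
} while (hdev_co_ioctl_sgio_retry(&acb, ret));
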