diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2018-07-05 15:53:04 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2018-07-05 15:53:04 +0100 |
commit | 1daf14ec9e66cbe3cbfa74ba17c6cdd9cc6d8e07 (patch) | |
tree | 9cf43b52ae3de9a2bc94a1bdeae5d60a148d6395 /block | |
parent | 6cf495be0b445789eeb7e88a6015c8cf74d4c1cf (diff) | |
parent | 7c20c808a5cbf5d244735bc78fc3138c739c1946 (diff) | |
download | qemu-1daf14ec9e66cbe3cbfa74ba17c6cdd9cc6d8e07.zip qemu-1daf14ec9e66cbe3cbfa74ba17c6cdd9cc6d8e07.tar.gz qemu-1daf14ec9e66cbe3cbfa74ba17c6cdd9cc6d8e07.tar.bz2 |
Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block layer patches:

- qcow2: Use worker threads for compression to improve performance of
  'qemu-img convert -W' and compressed backup jobs
- blklogwrites: New filter driver to log write requests to an image in
  the dm-log-writes format
- file-posix: Fix image locking during image creation
- crypto: Fix memory leak in error path
- Error out instead of silently truncating node names
# gpg: Signature made Thu 05 Jul 2018 11:24:33 BST
# gpg: using RSA key 7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6
* remotes/kevin/tags/for-upstream:
  file-posix: Unlock FD after creation
  file-posix: Fix creation locking
  block/blklogwrites: Add an option for the update interval of the log superblock
  block/blklogwrites: Add an option for appending to an old log
  block/blklogwrites: Change log_sector_size from int64_t to uint64_t
  block/crypto: Fix memory leak in create error path
  block: Don't silently truncate node names
  block: Add blklogwrites
  block: Move two block permission constants to the relevant enum
  qcow2: add compress threads
  qcow2: refactor data compression
  qemu-img: allow compressed not-in-order writes
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'block')
-rw-r--r-- | block/Makefile.objs | 1 |
-rw-r--r-- | block/blklogwrites.c | 547 |
-rw-r--r-- | block/crypto.c | 2 |
-rw-r--r-- | block/file-posix.c | 21 |
-rw-r--r-- | block/qcow2.c | 138 |
-rw-r--r-- | block/qcow2.h | 3 |
6 files changed, 679 insertions, 33 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs index 899bfb5..c8337bf 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -5,6 +5,7 @@ block-obj-y += qed-check.o block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o block-obj-y += quorum.o block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o +block-obj-y += blklogwrites.o block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += file-posix.o diff --git a/block/blklogwrites.c b/block/blklogwrites.c new file mode 100644 index 0000000..63bf6b3 --- /dev/null +++ b/block/blklogwrites.c @@ -0,0 +1,547 @@ +/* + * Write logging blk driver based on blkverify and blkdebug. + * + * Copyright (c) 2017 Tuomas Tynkkynen <tuomas@tuxera.com> + * Copyright (c) 2018 Aapo Vienamo <aapo@tuxera.com> + * Copyright (c) 2018 Ari Sundholm <ari@tuxera.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ +#include "block/block_int.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qstring.h" +#include "qemu/cutils.h" +#include "qemu/option.h" + +/* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */ + +#define LOG_FLUSH_FLAG (1 << 0) +#define LOG_FUA_FLAG (1 << 1) +#define LOG_DISCARD_FLAG (1 << 2) +#define LOG_MARK_FLAG (1 << 3) +#define LOG_FLAG_MASK (LOG_FLUSH_FLAG \ + | LOG_FUA_FLAG \ + | LOG_DISCARD_FLAG \ + | LOG_MARK_FLAG) + +#define WRITE_LOG_VERSION 1ULL +#define WRITE_LOG_MAGIC 0x6a736677736872ULL + +/* All fields are little-endian. */ +struct log_write_super { + uint64_t magic; + uint64_t version; + uint64_t nr_entries; + uint32_t sectorsize; +} QEMU_PACKED; + +struct log_write_entry { + uint64_t sector; + uint64_t nr_sectors; + uint64_t flags; + uint64_t data_len; +} QEMU_PACKED; + +/* End of disk format structures. 
*/ + +typedef struct { + BdrvChild *log_file; + uint32_t sectorsize; + uint32_t sectorbits; + uint64_t cur_log_sector; + uint64_t nr_entries; + uint64_t update_interval; +} BDRVBlkLogWritesState; + +static QemuOptsList runtime_opts = { + .name = "blklogwrites", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "log-append", + .type = QEMU_OPT_BOOL, + .help = "Append to an existing log", + }, + { + .name = "log-sector-size", + .type = QEMU_OPT_SIZE, + .help = "Log sector size", + }, + { + .name = "log-super-update-interval", + .type = QEMU_OPT_NUMBER, + .help = "Log superblock update interval (# of write requests)", + }, + { /* end of list */ } + }, +}; + +static inline uint32_t blk_log_writes_log2(uint32_t value) +{ + assert(value > 0); + return 31 - clz32(value); +} + +static inline bool blk_log_writes_sector_size_valid(uint32_t sector_size) +{ + return sector_size < (1ull << 24) && is_power_of_2(sector_size); +} + +static uint64_t blk_log_writes_find_cur_log_sector(BdrvChild *log, + uint32_t sector_size, + uint64_t nr_entries, + Error **errp) +{ + uint64_t cur_sector = 1; + uint64_t cur_idx = 0; + uint32_t sector_bits = blk_log_writes_log2(sector_size); + struct log_write_entry cur_entry; + + while (cur_idx < nr_entries) { + int read_ret = bdrv_pread(log, cur_sector << sector_bits, &cur_entry, + sizeof(cur_entry)); + if (read_ret < 0) { + error_setg_errno(errp, -read_ret, + "Failed to read log entry %"PRIu64, cur_idx); + return (uint64_t)-1ull; + } + + if (cur_entry.flags & ~cpu_to_le64(LOG_FLAG_MASK)) { + error_setg(errp, "Invalid flags 0x%"PRIx64" in log entry %"PRIu64, + le64_to_cpu(cur_entry.flags), cur_idx); + return (uint64_t)-1ull; + } + + /* Account for the sector of the entry itself */ + ++cur_sector; + + /* + * Account for the data of the write. + * For discards, this data is not present. 
+ */ + if (!(cur_entry.flags & cpu_to_le64(LOG_DISCARD_FLAG))) { + cur_sector += le64_to_cpu(cur_entry.nr_sectors); + } + + ++cur_idx; + } + + return cur_sector; +} + +static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVBlkLogWritesState *s = bs->opaque; + QemuOpts *opts; + Error *local_err = NULL; + int ret; + uint64_t log_sector_size; + bool log_append; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + ret = -EINVAL; + error_propagate(errp, local_err); + goto fail; + } + + /* Open the file */ + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false, + &local_err); + if (local_err) { + ret = -EINVAL; + error_propagate(errp, local_err); + goto fail; + } + + /* Open the log file */ + s->log_file = bdrv_open_child(NULL, options, "log", bs, &child_file, false, + &local_err); + if (local_err) { + ret = -EINVAL; + error_propagate(errp, local_err); + goto fail; + } + + log_append = qemu_opt_get_bool(opts, "log-append", false); + + if (log_append) { + struct log_write_super log_sb = { 0, 0, 0, 0 }; + + if (qemu_opt_find(opts, "log-sector-size")) { + ret = -EINVAL; + error_setg(errp, "log-append and log-sector-size are mutually " + "exclusive"); + goto fail_log; + } + + /* Read log superblock or fake one for an empty log */ + if (!bdrv_getlength(s->log_file->bs)) { + log_sb.magic = cpu_to_le64(WRITE_LOG_MAGIC); + log_sb.version = cpu_to_le64(WRITE_LOG_VERSION); + log_sb.nr_entries = cpu_to_le64(0); + log_sb.sectorsize = cpu_to_le32(BDRV_SECTOR_SIZE); + } else { + ret = bdrv_pread(s->log_file, 0, &log_sb, sizeof(log_sb)); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read log superblock"); + goto fail_log; + } + } + + if (log_sb.magic != cpu_to_le64(WRITE_LOG_MAGIC)) { + ret = -EINVAL; + error_setg(errp, "Invalid log superblock magic"); + goto fail_log; + } + + if (log_sb.version != 
cpu_to_le64(WRITE_LOG_VERSION)) { + ret = -EINVAL; + error_setg(errp, "Unsupported log version %"PRIu64, + le64_to_cpu(log_sb.version)); + goto fail_log; + } + + log_sector_size = le32_to_cpu(log_sb.sectorsize); + s->cur_log_sector = 1; + s->nr_entries = 0; + + if (blk_log_writes_sector_size_valid(log_sector_size)) { + s->cur_log_sector = + blk_log_writes_find_cur_log_sector(s->log_file, log_sector_size, + le64_to_cpu(log_sb.nr_entries), &local_err); + if (local_err) { + ret = -EINVAL; + error_propagate(errp, local_err); + goto fail_log; + } + + s->nr_entries = le64_to_cpu(log_sb.nr_entries); + } + } else { + log_sector_size = qemu_opt_get_size(opts, "log-sector-size", + BDRV_SECTOR_SIZE); + s->cur_log_sector = 1; + s->nr_entries = 0; + } + + if (!blk_log_writes_sector_size_valid(log_sector_size)) { + ret = -EINVAL; + error_setg(errp, "Invalid log sector size %"PRIu64, log_sector_size); + goto fail_log; + } + + s->sectorsize = log_sector_size; + s->sectorbits = blk_log_writes_log2(log_sector_size); + s->update_interval = qemu_opt_get_number(opts, "log-super-update-interval", + 4096); + if (!s->update_interval) { + ret = -EINVAL; + error_setg(errp, "Invalid log superblock update interval %"PRIu64, + s->update_interval); + goto fail_log; + } + + ret = 0; +fail_log: + if (ret < 0) { + bdrv_unref_child(bs, s->log_file); + s->log_file = NULL; + } +fail: + if (ret < 0) { + bdrv_unref_child(bs, bs->file); + bs->file = NULL; + } + qemu_opts_del(opts); + return ret; +} + +static void blk_log_writes_close(BlockDriverState *bs) +{ + BDRVBlkLogWritesState *s = bs->opaque; + + bdrv_unref_child(bs, s->log_file); + s->log_file = NULL; +} + +static int64_t blk_log_writes_getlength(BlockDriverState *bs) +{ + return bdrv_getlength(bs->file->bs); +} + +static void blk_log_writes_refresh_filename(BlockDriverState *bs, + QDict *options) +{ + BDRVBlkLogWritesState *s = bs->opaque; + + /* bs->file->bs has already been refreshed */ + bdrv_refresh_filename(s->log_file->bs); + + if 
(bs->file->bs->full_open_options + && s->log_file->bs->full_open_options) + { + QDict *opts = qdict_new(); + qdict_put_str(opts, "driver", "blklogwrites"); + + qobject_ref(bs->file->bs->full_open_options); + qdict_put_obj(opts, "file", QOBJECT(bs->file->bs->full_open_options)); + qobject_ref(s->log_file->bs->full_open_options); + qdict_put_obj(opts, "log", + QOBJECT(s->log_file->bs->full_open_options)); + qdict_put_int(opts, "log-sector-size", s->sectorsize); + + bs->full_open_options = opts; + } +} + +static void blk_log_writes_child_perm(BlockDriverState *bs, BdrvChild *c, + const BdrvChildRole *role, + BlockReopenQueue *ro_q, + uint64_t perm, uint64_t shrd, + uint64_t *nperm, uint64_t *nshrd) +{ + if (!c) { + *nperm = perm & DEFAULT_PERM_PASSTHROUGH; + *nshrd = (shrd & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED; + return; + } + + if (!strcmp(c->name, "log")) { + bdrv_format_default_perms(bs, c, role, ro_q, perm, shrd, nperm, nshrd); + } else { + bdrv_filter_default_perms(bs, c, role, ro_q, perm, shrd, nperm, nshrd); + } +} + +static void blk_log_writes_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BDRVBlkLogWritesState *s = bs->opaque; + bs->bl.request_alignment = s->sectorsize; +} + +static int coroutine_fn +blk_log_writes_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); +} + +typedef struct BlkLogWritesFileReq { + BlockDriverState *bs; + uint64_t offset; + uint64_t bytes; + int file_flags; + QEMUIOVector *qiov; + int (*func)(struct BlkLogWritesFileReq *r); + int file_ret; +} BlkLogWritesFileReq; + +typedef struct { + BlockDriverState *bs; + QEMUIOVector *qiov; + struct log_write_entry entry; + uint64_t zero_size; + int log_ret; +} BlkLogWritesLogReq; + +static void coroutine_fn blk_log_writes_co_do_log(BlkLogWritesLogReq *lr) +{ + BDRVBlkLogWritesState *s = lr->bs->opaque; + uint64_t cur_log_offset = s->cur_log_sector << 
s->sectorbits; + + s->nr_entries++; + s->cur_log_sector += + ROUND_UP(lr->qiov->size, s->sectorsize) >> s->sectorbits; + + lr->log_ret = bdrv_co_pwritev(s->log_file, cur_log_offset, lr->qiov->size, + lr->qiov, 0); + + /* Logging for the "write zeroes" operation */ + if (lr->log_ret == 0 && lr->zero_size) { + cur_log_offset = s->cur_log_sector << s->sectorbits; + s->cur_log_sector += + ROUND_UP(lr->zero_size, s->sectorsize) >> s->sectorbits; + + lr->log_ret = bdrv_co_pwrite_zeroes(s->log_file, cur_log_offset, + lr->zero_size, 0); + } + + /* Update super block on flush or every update interval */ + if (lr->log_ret == 0 && ((lr->entry.flags & LOG_FLUSH_FLAG) + || (s->nr_entries % s->update_interval == 0))) + { + struct log_write_super super = { + .magic = cpu_to_le64(WRITE_LOG_MAGIC), + .version = cpu_to_le64(WRITE_LOG_VERSION), + .nr_entries = cpu_to_le64(s->nr_entries), + .sectorsize = cpu_to_le32(s->sectorsize), + }; + void *zeroes = g_malloc0(s->sectorsize - sizeof(super)); + QEMUIOVector qiov; + + qemu_iovec_init(&qiov, 2); + qemu_iovec_add(&qiov, &super, sizeof(super)); + qemu_iovec_add(&qiov, zeroes, s->sectorsize - sizeof(super)); + + lr->log_ret = + bdrv_co_pwritev(s->log_file, 0, s->sectorsize, &qiov, 0); + if (lr->log_ret == 0) { + lr->log_ret = bdrv_co_flush(s->log_file->bs); + } + qemu_iovec_destroy(&qiov); + g_free(zeroes); + } +} + +static void coroutine_fn blk_log_writes_co_do_file(BlkLogWritesFileReq *fr) +{ + fr->file_ret = fr->func(fr); +} + +static int coroutine_fn +blk_log_writes_co_log(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags, + int (*file_func)(BlkLogWritesFileReq *r), + uint64_t entry_flags, bool is_zero_write) +{ + QEMUIOVector log_qiov; + size_t niov = qiov ? 
qiov->niov : 0; + BDRVBlkLogWritesState *s = bs->opaque; + BlkLogWritesFileReq fr = { + .bs = bs, + .offset = offset, + .bytes = bytes, + .file_flags = flags, + .qiov = qiov, + .func = file_func, + }; + BlkLogWritesLogReq lr = { + .bs = bs, + .qiov = &log_qiov, + .entry = { + .sector = cpu_to_le64(offset >> s->sectorbits), + .nr_sectors = cpu_to_le64(bytes >> s->sectorbits), + .flags = cpu_to_le64(entry_flags), + .data_len = 0, + }, + .zero_size = is_zero_write ? bytes : 0, + }; + void *zeroes = g_malloc0(s->sectorsize - sizeof(lr.entry)); + + assert((1 << s->sectorbits) == s->sectorsize); + assert(bs->bl.request_alignment == s->sectorsize); + assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)); + assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment)); + + qemu_iovec_init(&log_qiov, niov + 2); + qemu_iovec_add(&log_qiov, &lr.entry, sizeof(lr.entry)); + qemu_iovec_add(&log_qiov, zeroes, s->sectorsize - sizeof(lr.entry)); + if (qiov) { + qemu_iovec_concat(&log_qiov, qiov, 0, qiov->size); + } + + blk_log_writes_co_do_file(&fr); + blk_log_writes_co_do_log(&lr); + + qemu_iovec_destroy(&log_qiov); + g_free(zeroes); + + if (lr.log_ret < 0) { + return lr.log_ret; + } + + return fr.file_ret; +} + +static int coroutine_fn +blk_log_writes_co_do_file_pwritev(BlkLogWritesFileReq *fr) +{ + return bdrv_co_pwritev(fr->bs->file, fr->offset, fr->bytes, + fr->qiov, fr->file_flags); +} + +static int coroutine_fn +blk_log_writes_co_do_file_pwrite_zeroes(BlkLogWritesFileReq *fr) +{ + return bdrv_co_pwrite_zeroes(fr->bs->file, fr->offset, fr->bytes, + fr->file_flags); +} + +static int coroutine_fn blk_log_writes_co_do_file_flush(BlkLogWritesFileReq *fr) +{ + return bdrv_co_flush(fr->bs->file->bs); +} + +static int coroutine_fn +blk_log_writes_co_do_file_pdiscard(BlkLogWritesFileReq *fr) +{ + return bdrv_co_pdiscard(fr->bs->file->bs, fr->offset, fr->bytes); +} + +static int coroutine_fn +blk_log_writes_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + 
QEMUIOVector *qiov, int flags) +{ + return blk_log_writes_co_log(bs, offset, bytes, qiov, flags, + blk_log_writes_co_do_file_pwritev, 0, false); +} + +static int coroutine_fn +blk_log_writes_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, + BdrvRequestFlags flags) +{ + return blk_log_writes_co_log(bs, offset, bytes, NULL, flags, + blk_log_writes_co_do_file_pwrite_zeroes, 0, + true); +} + +static int coroutine_fn blk_log_writes_co_flush_to_disk(BlockDriverState *bs) +{ + return blk_log_writes_co_log(bs, 0, 0, NULL, 0, + blk_log_writes_co_do_file_flush, + LOG_FLUSH_FLAG, false); +} + +static int coroutine_fn +blk_log_writes_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) +{ + return blk_log_writes_co_log(bs, offset, count, NULL, 0, + blk_log_writes_co_do_file_pdiscard, + LOG_DISCARD_FLAG, false); +} + +static BlockDriver bdrv_blk_log_writes = { + .format_name = "blklogwrites", + .instance_size = sizeof(BDRVBlkLogWritesState), + + .bdrv_open = blk_log_writes_open, + .bdrv_close = blk_log_writes_close, + .bdrv_getlength = blk_log_writes_getlength, + .bdrv_refresh_filename = blk_log_writes_refresh_filename, + .bdrv_child_perm = blk_log_writes_child_perm, + .bdrv_refresh_limits = blk_log_writes_refresh_limits, + + .bdrv_co_preadv = blk_log_writes_co_preadv, + .bdrv_co_pwritev = blk_log_writes_co_pwritev, + .bdrv_co_pwrite_zeroes = blk_log_writes_co_pwrite_zeroes, + .bdrv_co_flush_to_disk = blk_log_writes_co_flush_to_disk, + .bdrv_co_pdiscard = blk_log_writes_co_pdiscard, + .bdrv_co_block_status = bdrv_co_block_status_from_file, + + .is_filter = true, +}; + +static void bdrv_blk_log_writes_init(void) +{ + bdrv_register(&bdrv_blk_log_writes); +} + +block_init(bdrv_blk_log_writes_init); diff --git a/block/crypto.c b/block/crypto.c index 994172a..146d81c 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -551,7 +551,7 @@ static int coroutine_fn block_crypto_co_create_opts_luks(const char *filename, /* Create protocol layer */ ret = 
bdrv_create_file(filename, opts, errp); if (ret < 0) { - return ret; + goto fail; } bs = bdrv_open(filename, NULL, NULL, diff --git a/block/file-posix.c b/block/file-posix.c index 829ee53..98987b8 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2111,8 +2111,9 @@ static int coroutine_fn raw_co_create(BlockdevCreateOptions *options, Error **errp) { BlockdevCreateOptionsFile *file_opts; + Error *local_err = NULL; int fd; - int perm, shared; + uint64_t perm, shared; int result = 0; /* Validate options and set default values */ @@ -2148,7 +2149,7 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE; /* Step one: Take locks */ - result = raw_apply_lock_bytes(fd, perm, shared, false, errp); + result = raw_apply_lock_bytes(fd, perm, ~shared, false, errp); if (result < 0) { goto out_close; } @@ -2156,13 +2157,13 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) /* Step two: Check that nobody else has taken conflicting locks */ result = raw_check_lock_bytes(fd, perm, shared, errp); if (result < 0) { - goto out_close; + goto out_unlock; } /* Clear the file by truncating it to 0 */ result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp); if (result < 0) { - goto out_close; + goto out_unlock; } if (file_opts->nocow) { @@ -2185,7 +2186,17 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) result = raw_regular_truncate(NULL, fd, file_opts->size, file_opts->preallocation, errp); if (result < 0) { - goto out_close; + goto out_unlock; + } + +out_unlock: + raw_apply_lock_bytes(fd, 0, 0, true, &local_err); + if (local_err) { + /* The above call should not fail, and if it does, that does + * not mean the whole creation operation has failed. So + * report it the user for their convenience, but do not report + * it to the caller. 
*/ + error_report_err(local_err); } out_close: diff --git a/block/qcow2.c b/block/qcow2.c index 2f9e58e..33b61b7 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -23,11 +23,14 @@ */ #include "qemu/osdep.h" + +#define ZLIB_CONST +#include <zlib.h> + #include "block/block_int.h" #include "block/qdict.h" #include "sysemu/block-backend.h" #include "qemu/module.h" -#include <zlib.h> #include "qcow2.h" #include "qemu/error-report.h" #include "qapi/error.h" @@ -41,6 +44,7 @@ #include "qapi/qobject-input-visitor.h" #include "qapi/qapi-visit-block-core.h" #include "crypto.h" +#include "block/thread-pool.h" /* Differences with QCOW: @@ -1541,6 +1545,9 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options, qcow2_check_refcounts(bs, &result, 0); } #endif + + qemu_co_queue_init(&s->compress_wait_queue); + return ret; fail: @@ -3650,6 +3657,104 @@ fail: return ret; } +/* + * qcow2_compress() + * + * @dest - destination buffer, at least of @size-1 bytes + * @src - source buffer, @size bytes + * + * Returns: compressed size on success + * -1 if compression is inefficient + * -2 on any other error + */ +static ssize_t qcow2_compress(void *dest, const void *src, size_t size) +{ + ssize_t ret; + z_stream strm; + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, + -12, 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + return -2; + } + + /* strm.next_in is not const in old zlib versions, such as those used on + * OpenBSD/NetBSD, so cast the const away */ + strm.avail_in = size; + strm.next_in = (void *) src; + strm.avail_out = size - 1; + strm.next_out = dest; + + ret = deflate(&strm, Z_FINISH); + if (ret == Z_STREAM_END) { + ret = size - 1 - strm.avail_out; + } else { + ret = (ret == Z_OK ? 
-1 : -2); + } + + deflateEnd(&strm); + + return ret; +} + +#define MAX_COMPRESS_THREADS 4 + +typedef struct Qcow2CompressData { + void *dest; + const void *src; + size_t size; + ssize_t ret; +} Qcow2CompressData; + +static int qcow2_compress_pool_func(void *opaque) +{ + Qcow2CompressData *data = opaque; + + data->ret = qcow2_compress(data->dest, data->src, data->size); + + return 0; +} + +static void qcow2_compress_complete(void *opaque, int ret) +{ + qemu_coroutine_enter(opaque); +} + +/* See qcow2_compress definition for parameters description */ +static ssize_t qcow2_co_compress(BlockDriverState *bs, + void *dest, const void *src, size_t size) +{ + BDRVQcow2State *s = bs->opaque; + BlockAIOCB *acb; + ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + Qcow2CompressData arg = { + .dest = dest, + .src = src, + .size = size, + }; + + while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) { + qemu_co_queue_wait(&s->compress_wait_queue, NULL); + } + + s->nb_compress_threads++; + acb = thread_pool_submit_aio(pool, qcow2_compress_pool_func, &arg, + qcow2_compress_complete, + qemu_coroutine_self()); + + if (!acb) { + s->nb_compress_threads--; + return -EINVAL; + } + qemu_coroutine_yield(); + s->nb_compress_threads--; + qemu_co_queue_next(&s->compress_wait_queue); + + return arg.ret; +} + /* XXX: put compressed sectors first, then all the cluster aligned tables to avoid losing bytes in alignment */ static coroutine_fn int @@ -3659,8 +3764,8 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset, BDRVQcow2State *s = bs->opaque; QEMUIOVector hd_qiov; struct iovec iov; - z_stream strm; - int ret, out_len; + int ret; + size_t out_len; uint8_t *buf, *out_buf; int64_t cluster_offset; @@ -3694,32 +3799,11 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset, out_buf = g_malloc(s->cluster_size); - /* best compression, small window, no zlib header */ - memset(&strm, 0, sizeof(strm)); - ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, 
- Z_DEFLATED, -12, - 9, Z_DEFAULT_STRATEGY); - if (ret != 0) { + out_len = qcow2_co_compress(bs, out_buf, buf, s->cluster_size); + if (out_len == -2) { ret = -EINVAL; goto fail; - } - - strm.avail_in = s->cluster_size; - strm.next_in = (uint8_t *)buf; - strm.avail_out = s->cluster_size; - strm.next_out = out_buf; - - ret = deflate(&strm, Z_FINISH); - if (ret != Z_STREAM_END && ret != Z_OK) { - deflateEnd(&strm); - ret = -EINVAL; - goto fail; - } - out_len = strm.next_out - out_buf; - - deflateEnd(&strm); - - if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + } else if (out_len == -1) { /* could not compress: write normal cluster */ ret = qcow2_co_pwritev(bs, offset, bytes, qiov, 0); if (ret < 0) { diff --git a/block/qcow2.h b/block/qcow2.h index 1c9c0d3..d6aca68 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -326,6 +326,9 @@ typedef struct BDRVQcow2State { * override) */ char *image_backing_file; char *image_backing_format; + + CoQueue compress_wait_queue; + int nb_compress_threads; } BDRVQcow2State; typedef struct Qcow2COWRegion { |