From e2462113b2003085ad16f15e1442ded64e2d9a29 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Mon, 12 Jan 2015 14:11:13 +0100 Subject: block: add event when disk usage exceeds threshold Managing applications, like oVirt (http://www.ovirt.org), make extensive use of thin-provisioned disk images. To let the guest run smoothly and not be unnecessarily paused, oVirt sets a disk usage threshold (so-called 'high water mark') based on the occupation of the device, and automatically extends the image once the threshold is reached or exceeded. In order to detect the crossing of the threshold, oVirt has no choice but to aggressively poll the QEMU monitor using the query-blockstats command. This leads to unnecessary system load, and is made even worse at scale: deployments with hundreds of VMs are no longer rare. To fix this, this patch adds: * A new monitor command `block-set-write-threshold', to set a mark for a given block device. * A new event `BLOCK_WRITE_THRESHOLD', to report when a block device's usage exceeds the threshold. * A new `write_threshold' field in the `BlockDeviceInfo' structure, to report the configured threshold. This will allow the managing application to use smarter and more efficient monitoring, greatly reducing the need for polling. [Updated qemu-iotests 067 output to add the new 'write_threshold' property. --Stefan] [Changed g_assert_false() to g_assert(!...) to fix the build on older glib versions. 
--Kevin] Signed-off-by: Francesco Romani Reviewed-by: Eric Blake Message-id: 1421068273-692-1-git-send-email-fromani@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/Makefile.objs | 1 + block/qapi.c | 3 + block/write-threshold.c | 125 ++++++++++++++++++++++++++++++++++++++++ include/block/block_int.h | 4 ++ include/block/write-threshold.h | 64 ++++++++++++++++++++ qapi/block-core.json | 51 +++++++++++++++- qmp-commands.hx | 32 ++++++++++ tests/Makefile | 3 + tests/qemu-iotests/067.out | 5 ++ tests/test-write-threshold.c | 119 ++++++++++++++++++++++++++++++++++++++ 10 files changed, 406 insertions(+), 1 deletion(-) create mode 100644 block/write-threshold.c create mode 100644 include/block/write-threshold.h create mode 100644 tests/test-write-threshold.c diff --git a/block/Makefile.objs b/block/Makefile.objs index 04b0e43..010afad 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -20,6 +20,7 @@ block-obj-$(CONFIG_GLUSTERFS) += gluster.o block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o block-obj-$(CONFIG_LIBSSH2) += ssh.o block-obj-y += accounting.o +block-obj-y += write-threshold.o common-obj-y += stream.o common-obj-y += commit.o diff --git a/block/qapi.c b/block/qapi.c index d1a8917..1808e67 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -24,6 +24,7 @@ #include "block/qapi.h" #include "block/block_int.h" +#include "block/write-threshold.h" #include "qmp-commands.h" #include "qapi-visit.h" #include "qapi/qmp-output-visitor.h" @@ -89,6 +90,8 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) info->iops_size = cfg.op_size; } + info->write_threshold = bdrv_write_threshold_get(bs); + return info; } diff --git a/block/write-threshold.c b/block/write-threshold.c new file mode 100644 index 0000000..c2cd517 --- /dev/null +++ b/block/write-threshold.c @@ -0,0 +1,125 @@ +/* + * QEMU System Emulator block write threshold notification + * + * Copyright Red Hat, Inc. 
2014 + * + * Authors: + * Francesco Romani + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + */ + +#include "block/block_int.h" +#include "block/coroutine.h" +#include "block/write-threshold.h" +#include "qemu/notify.h" +#include "qapi-event.h" +#include "qmp-commands.h" + + +uint64_t bdrv_write_threshold_get(const BlockDriverState *bs) +{ + return bs->write_threshold_offset; +} + +bool bdrv_write_threshold_is_set(const BlockDriverState *bs) +{ + return bs->write_threshold_offset > 0; +} + +static void write_threshold_disable(BlockDriverState *bs) +{ + if (bdrv_write_threshold_is_set(bs)) { + notifier_with_return_remove(&bs->write_threshold_notifier); + bs->write_threshold_offset = 0; + } +} + +uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs, + const BdrvTrackedRequest *req) +{ + if (bdrv_write_threshold_is_set(bs)) { + if (req->offset > bs->write_threshold_offset) { + return (req->offset - bs->write_threshold_offset) + req->bytes; + } + if ((req->offset + req->bytes) > bs->write_threshold_offset) { + return (req->offset + req->bytes) - bs->write_threshold_offset; + } + } + return 0; +} + +static int coroutine_fn before_write_notify(NotifierWithReturn *notifier, + void *opaque) +{ + BdrvTrackedRequest *req = opaque; + BlockDriverState *bs = req->bs; + uint64_t amount = 0; + + amount = bdrv_write_threshold_exceeded(bs, req); + if (amount > 0) { + qapi_event_send_block_write_threshold( + bs->node_name, + amount, + bs->write_threshold_offset, + &error_abort); + + /* autodisable to avoid flooding the monitor */ + write_threshold_disable(bs); + } + + return 0; /* should always let other notifiers run */ +} + +static void write_threshold_register_notifier(BlockDriverState *bs) +{ + bs->write_threshold_notifier.notify = before_write_notify; + notifier_with_return_list_add(&bs->before_write_notifiers, + &bs->write_threshold_notifier); +} + +static void 
write_threshold_update(BlockDriverState *bs, + int64_t threshold_bytes) +{ + bs->write_threshold_offset = threshold_bytes; +} + +void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes) +{ + if (bdrv_write_threshold_is_set(bs)) { + if (threshold_bytes > 0) { + write_threshold_update(bs, threshold_bytes); + } else { + write_threshold_disable(bs); + } + } else { + if (threshold_bytes > 0) { + /* avoid multiple registration */ + write_threshold_register_notifier(bs); + write_threshold_update(bs, threshold_bytes); + } + /* discard bogus disable request */ + } +} + +void qmp_block_set_write_threshold(const char *node_name, + uint64_t threshold_bytes, + Error **errp) +{ + BlockDriverState *bs; + AioContext *aio_context; + + bs = bdrv_find_node(node_name); + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, node_name); + return; + } + + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + bdrv_write_threshold_set(bs, threshold_bytes); + + aio_context_release(aio_context); +} diff --git a/include/block/block_int.h b/include/block/block_int.h index e264be9..7ad1950 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -412,6 +412,10 @@ struct BlockDriverState { /* The error object in use for blocking operations on backing_hd */ Error *backing_blocker; + + /* threshold limit for writes, in bytes. "High water mark". */ + uint64_t write_threshold_offset; + NotifierWithReturn write_threshold_notifier; }; diff --git a/include/block/write-threshold.h b/include/block/write-threshold.h new file mode 100644 index 0000000..f1b899c --- /dev/null +++ b/include/block/write-threshold.h @@ -0,0 +1,64 @@ +/* + * QEMU System Emulator block write threshold notification + * + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Francesco Romani + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. 
+ */ +#ifndef BLOCK_WRITE_THRESHOLD_H +#define BLOCK_WRITE_THRESHOLD_H + +#include <stdint.h> + +#include "qemu/typedefs.h" +#include "qemu-common.h" + +/* + * bdrv_write_threshold_set: + * + * Set the write threshold for block devices, in bytes. + * Notify when a write exceeds the threshold, meaning the device + * is becoming full, so it can be transparently resized. + * To be used with thin-provisioned block devices. + * + * Use threshold_bytes == 0 to disable. + */ +void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes); + +/* + * bdrv_write_threshold_get + * + * Get the configured write threshold, in bytes. + * Zero means no threshold configured. + */ +uint64_t bdrv_write_threshold_get(const BlockDriverState *bs); + +/* + * bdrv_write_threshold_is_set + * + * Tell if a write threshold is set for a given BDS. + */ +bool bdrv_write_threshold_is_set(const BlockDriverState *bs); + +/* + * bdrv_write_threshold_exceeded + * + * Return the extent of a write request that exceeded the threshold, + * or zero if the request is below the threshold. + * Return zero also if the threshold was not set. + * + * NOTE: here we assume the following holds for each request this code + * deals with: + * + * assert((req->offset + req->bytes) <= UINT64_MAX) + * + * Please note there is *not* an actual C assert(). + */ +uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs, + const BdrvTrackedRequest *req); + +#endif diff --git a/qapi/block-core.json b/qapi/block-core.json index b7d9772..a3fdaf0 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -257,6 +257,9 @@ # # @cache: the cache mode used for the block device (since: 2.3) # +# @write_threshold: configured write threshold for the device. +# 0 if disabled. 
(Since 2.3) +# # Since: 0.14.0 # ## @@ -271,7 +274,8 @@ '*bps_max': 'int', '*bps_rd_max': 'int', '*bps_wr_max': 'int', '*iops_max': 'int', '*iops_rd_max': 'int', '*iops_wr_max': 'int', - '*iops_size': 'int', 'cache': 'BlockdevCacheInfo' } } + '*iops_size': 'int', 'cache': 'BlockdevCacheInfo', + 'write_threshold': 'int' } } ## # @BlockDeviceIoStatus: @@ -1917,3 +1921,48 @@ ## { 'enum': 'PreallocMode', 'data': [ 'off', 'metadata', 'falloc', 'full' ] } + +## +# @BLOCK_WRITE_THRESHOLD +# +# Emitted when a write on a block device reaches or exceeds the +# configured write threshold. For thin-provisioned devices, this +# means the device should be extended to avoid pausing for +# disk exhaustion. +# The event is one shot. Once triggered, it needs to be +# re-registered with another block-set-write-threshold command. +# +# @node-name: graph node name on which the threshold was exceeded. +# +# @amount-exceeded: amount of data which exceeded the threshold, in bytes. +# +# @write-threshold: last configured threshold, in bytes. +# +# Since: 2.3 +## +{ 'event': 'BLOCK_WRITE_THRESHOLD', + 'data': { 'node-name': 'str', + 'amount-exceeded': 'uint64', + 'write-threshold': 'uint64' } } + +## +# @block-set-write-threshold +# +# Change the write threshold for a block drive. An event will be delivered +# if a write to this block drive crosses the configured threshold. +# This is useful to transparently resize thin-provisioned drives without +# the guest OS noticing. +# +# @node-name: graph node name on which the threshold must be set. +# +# @write-threshold: configured threshold for the block device, in bytes. +# Use 0 to disable the threshold. 
+# +# Returns: Nothing on success +# If @node-name is not found on the block device graph, +# DeviceNotFound +# +# Since: 2.3 +## +{ 'command': 'block-set-write-threshold', + 'data': { 'node-name': 'str', 'write-threshold': 'uint64' } } diff --git a/qmp-commands.hx b/qmp-commands.hx index af3fd19..a85d847 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -2146,6 +2146,8 @@ Each json-object contain the following: - "iops_size": I/O size when limiting by iops (json-int) - "detect_zeroes": detect and optimize zero writing (json-string) - Possible values: "off", "on", "unmap" + - "write_threshold": write offset threshold in bytes, an event will be + emitted if crossed. Zero if disabled (json-int) - "image": the detail of the image, it is a json-object containing the following: - "filename": image file name (json-string) @@ -2223,6 +2225,7 @@ Example: "iops_wr_max": 0, "iops_size": 0, "detect_zeroes": "on", + "write_threshold": 0, "image":{ "filename":"disks/test.qcow2", "format":"qcow2", @@ -3685,6 +3688,7 @@ Example: "iops_rd_max": 0, "iops_wr_max": 0, "iops_size": 0, + "write_threshold": 0, "image":{ "filename":"disks/test.qcow2", "format":"qcow2", @@ -3921,3 +3925,31 @@ Move mouse pointer to absolute coordinates (20000, 400). <- { "return": {} } EQMP + + { + .name = "block-set-write-threshold", + .args_type = "node-name:s,write-threshold:l", + .mhandler.cmd_new = qmp_marshal_input_block_set_write_threshold, + }, + +SQMP +block-set-write-threshold +------------ + +Change the write threshold for a block drive. The threshold is an offset, +thus must be non-negative. Default is no write threshold. +Setting the threshold to zero disables it. 
+ +Arguments: + +- "node-name": the node name in the block driver state graph (json-string) +- "write-threshold": the write threshold in bytes (json-int) + +Example: + +-> { "execute": "block-set-write-threshold", + "arguments": { "node-name": "mydev", + "write-threshold": 17179869184 } } +<- { "return": {} } + +EQMP diff --git a/tests/Makefile b/tests/Makefile index 5caccf7..d5df168 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -68,6 +68,8 @@ check-unit-y += tests/check-qom-interface$(EXESUF) gcov-files-check-qom-interface-y = qom/object.c check-unit-y += tests/test-qemu-opts$(EXESUF) gcov-files-test-qemu-opts-y = qom/test-qemu-opts.c +check-unit-y += tests/test-write-threshold$(EXESUF) +gcov-files-test-write-threshold-y = block/write-threshold.c check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh @@ -360,6 +362,7 @@ tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y) tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o $(qtest-obj-y) tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a libqemustub.a +tests/test-write-threshold$(EXESUF): tests/test-write-threshold.o $(block-obj-y) libqemuutil.a libqemustub.a ifeq ($(CONFIG_POSIX),y) LIBS += -lutil diff --git a/tests/qemu-iotests/067.out b/tests/qemu-iotests/067.out index 13ff3cd..00b3eae 100644 --- a/tests/qemu-iotests/067.out +++ b/tests/qemu-iotests/067.out @@ -43,6 +43,7 @@ Testing: -drive file=TEST_DIR/t.qcow2,format=qcow2,if=none,id=disk -device virti "drv": "qcow2", "iops": 0, "bps_wr": 0, + "write_threshold": 0, "encrypted": false, "bps": 0, "bps_rd": 0, @@ -218,6 +219,7 @@ Testing: -drive file=TEST_DIR/t.qcow2,format=qcow2,if=none,id=disk "drv": "qcow2", "iops": 0, "bps_wr": 0, + "write_threshold": 0, "encrypted": false, "bps": 0, "bps_rd": 0, @@ -423,6 +425,7 @@ Testing: "drv": "qcow2", "iops": 0, "bps_wr": 0, + "write_threshold": 0, 
"encrypted": false, "bps": 0, "bps_rd": 0, @@ -607,6 +610,7 @@ Testing: "drv": "qcow2", "iops": 0, "bps_wr": 0, + "write_threshold": 0, "encrypted": false, "bps": 0, "bps_rd": 0, @@ -717,6 +721,7 @@ Testing: "drv": "qcow2", "iops": 0, "bps_wr": 0, + "write_threshold": 0, "encrypted": false, "bps": 0, "bps_rd": 0, diff --git a/tests/test-write-threshold.c b/tests/test-write-threshold.c new file mode 100644 index 0000000..faffa7b --- /dev/null +++ b/tests/test-write-threshold.c @@ -0,0 +1,119 @@ +/* + * Test block device write threshold + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include +#include +#include "block/block_int.h" +#include "block/write-threshold.h" + + +static void test_threshold_not_set_on_init(void) +{ + uint64_t res; + BlockDriverState bs; + memset(&bs, 0, sizeof(bs)); + + g_assert(!bdrv_write_threshold_is_set(&bs)); + + res = bdrv_write_threshold_get(&bs); + g_assert_cmpint(res, ==, 0); +} + +static void test_threshold_set_get(void) +{ + uint64_t threshold = 4 * 1024 * 1024; + uint64_t res; + BlockDriverState bs; + memset(&bs, 0, sizeof(bs)); + + bdrv_write_threshold_set(&bs, threshold); + + g_assert(bdrv_write_threshold_is_set(&bs)); + + res = bdrv_write_threshold_get(&bs); + g_assert_cmpint(res, ==, threshold); +} + +static void test_threshold_multi_set_get(void) +{ + uint64_t threshold1 = 4 * 1024 * 1024; + uint64_t threshold2 = 15 * 1024 * 1024; + uint64_t res; + BlockDriverState bs; + memset(&bs, 0, sizeof(bs)); + + bdrv_write_threshold_set(&bs, threshold1); + bdrv_write_threshold_set(&bs, threshold2); + res = bdrv_write_threshold_get(&bs); + g_assert_cmpint(res, ==, threshold2); +} + +static void test_threshold_not_trigger(void) +{ + uint64_t amount = 0; + uint64_t threshold = 4 * 1024 * 1024; + BlockDriverState bs; + BdrvTrackedRequest req; + + memset(&bs, 0, sizeof(bs)); + memset(&req, 0, sizeof(req)); + req.offset = 1024; + req.bytes 
= 1024; + + bdrv_write_threshold_set(&bs, threshold); + amount = bdrv_write_threshold_exceeded(&bs, &req); + g_assert_cmpuint(amount, ==, 0); +} + + +static void test_threshold_trigger(void) +{ + uint64_t amount = 0; + uint64_t threshold = 4 * 1024 * 1024; + BlockDriverState bs; + BdrvTrackedRequest req; + + memset(&bs, 0, sizeof(bs)); + memset(&req, 0, sizeof(req)); + req.offset = (4 * 1024 * 1024) - 1024; + req.bytes = 2 * 1024; + + bdrv_write_threshold_set(&bs, threshold); + amount = bdrv_write_threshold_exceeded(&bs, &req); + g_assert_cmpuint(amount, >=, 1024); +} + +typedef struct TestStruct { + const char *name; + void (*func)(void); +} TestStruct; + + +int main(int argc, char **argv) +{ + size_t i; + TestStruct tests[] = { + { "/write-threshold/not-set-on-init", + test_threshold_not_set_on_init }, + { "/write-threshold/set-get", + test_threshold_set_get }, + { "/write-threshold/multi-set-get", + test_threshold_multi_set_get }, + { "/write-threshold/not-trigger", + test_threshold_not_trigger }, + { "/write-threshold/trigger", + test_threshold_trigger }, + { NULL, NULL } + }; + + g_test_init(&argc, &argv, NULL); + for (i = 0; tests[i].name != NULL; i++) { + g_test_add_func(tests[i].name, tests[i].func); + } + return g_test_run(); +} -- cgit v1.1