From 18658a3ced71e0fb23a0ef80ef0bbf1a9cbb7a2b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 14 Feb 2019 18:35:50 +0100 Subject: vhost: restrict Linux dependency to kernel vhost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vhost-user does not depend on Linux; it can run on any POSIX system. Restrict vhost-kernel to Linux in hw/virtio/vhost-backend.c, everything else can be compiled on all POSIX systems. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Thomas Huth Message-Id: <1543851204-41186-4-git-send-email-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini Message-Id: <1550165756-21617-4-git-send-email-pbonzini@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/Makefile.objs | 8 +++++--- hw/virtio/vhost-backend.c | 12 ++++++++++-- hw/virtio/vhost-user.c | 13 ++++++++++++- hw/virtio/vhost.c | 2 +- 4 files changed, 28 insertions(+), 7 deletions(-) (limited to 'hw/virtio') diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index d335dd0..ce542e7 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -2,15 +2,18 @@ ifeq ($(CONFIG_VIRTIO),y) common-obj-y += virtio-bus.o obj-y += virtio.o +obj-$(call lor,$(CONFIG_VHOST_USER),$(CONFIG_LINUX)) += vhost.o vhost-backend.o +common-obj-$(call lnot,$(call lor,$(CONFIG_VHOST_USER),$(CONFIG_LINUX))) += vhost-stub.o +obj-$(CONFIG_VHOST_USER) += vhost-user.o + common-obj-$(CONFIG_VIRTIO_RNG) += virtio-rng.o common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o common-obj-$(CONFIG_VIRTIO_MMIO) += virtio-mmio.o obj-$(CONFIG_VIRTIO_BALLOON) += virtio-balloon.o obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) += virtio-crypto-pci.o - -obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o + ifeq ($(CONFIG_VIRTIO_PCI),y) obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-pci.o obj-$(CONFIG_VHOST_USER_BLK) += vhost-user-blk-pci.o @@ -28,5 +31,4 @@ obj-$(CONFIG_VIRTIO_SERIAL) += virtio-serial-pci.o endif endif -common-obj-$(call lnot,$(call land,$(CONFIG_VIRTIO),$(CONFIG_LINUX))) += vhost-stub.o common-obj-$(CONFIG_ALL) += vhost-stub.o diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c index 7f09efa..e0f0bb7 100644 --- a/hw/virtio/vhost-backend.c +++ b/hw/virtio/vhost-backend.c @@ -9,11 +9,14 @@ */ #include "qemu/osdep.h" -#include -#include #include "hw/virtio/vhost.h" #include "hw/virtio/vhost-backend.h" #include "qemu/error-report.h" +#include "standard-headers/linux/vhost_types.h" + +#ifdef CONFIG_LINUX +#include +#include static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request, void *arg) @@ -265,18 +268,23 @@ static const VhostOps kernel_ops = { .vhost_set_iotlb_callback = vhost_kernel_set_iotlb_callback, .vhost_send_device_iotlb_msg = vhost_kernel_send_device_iotlb_msg, }; +#endif int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type) { int r = 0; switch (backend_type) { +#ifdef CONFIG_LINUX case VHOST_BACKEND_TYPE_KERNEL: dev->vhost_ops = &kernel_ops; break; +#endif +#ifdef CONFIG_VHOST_USER case VHOST_BACKEND_TYPE_USER: dev->vhost_ops = &user_ops; break; +#endif default: error_report("Unknown vhost backend type"); r = -1; diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 564a31d..0d6c64e 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -27,8 +27,12 @@ #include #include #include -#include + +#include "standard-headers/linux/vhost_types.h" + +#ifdef CONFIG_LINUX #include +#endif #define VHOST_MEMORY_MAX_NREGIONS 8 #define VHOST_USER_F_PROTOCOL_FEATURES 30 @@ -1110,6 +1114,7 @@ out: return ret; } +#ifdef CONFIG_LINUX /* * Called back from the postcopy fault thread when a fault is received on our * ufd. @@ -1177,6 +1182,7 @@ static int vhost_user_postcopy_waker(struct PostCopyFD *pcfd, RAMBlock *rb, trace_vhost_user_postcopy_waker_nomatch(qemu_ram_get_idstr(rb), offset); return 0; } +#endif /* * Called at the start of an inbound postcopy on reception of the @@ -1184,6 +1190,7 @@ static int vhost_user_postcopy_waker(struct PostCopyFD *pcfd, RAMBlock *rb, */ static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp) { +#ifdef CONFIG_LINUX struct vhost_user *u = dev->opaque; CharBackend *chr = u->user->chr; int ufd; @@ -1227,6 +1234,10 @@ static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp) u->postcopy_fd.idstr = "vhost-user"; /* Need to find unique name */ postcopy_register_shared_ufd(&u->postcopy_fd); return 0; +#else + error_setg(errp, "Postcopy not supported on non-Linux systems"); + return -1; +#endif } /* diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 569c405..311432f 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -21,7 +21,7 @@ #include "qemu/range.h" #include "qemu/error-report.h" #include "qemu/memfd.h" -#include +#include "standard-headers/linux/vhost_types.h" #include "exec/address-spaces.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" -- cgit v1.1 From 299e6f19b3e285f3ab00b780e3b48f5e58b5d4ed Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 14 Feb 2019 18:35:53 +0100 Subject: vhost-net: revamp configure logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Detect all invalid configurations (e.g. mingw32 with vhost-user, non-Linux with vhost-kernel). As a collateral benefit, all vhost-kernel backends can be now disabled if one wants to reduce the attack surface. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Paolo Bonzini Reviewed-by: Thomas Huth Message-Id: <1543851204-41186-6-git-send-email-pbonzini@redhat.com> Message-Id: <1550165756-21617-7-git-send-email-pbonzini@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/Makefile.objs | 4 ++-- hw/virtio/vhost-backend.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'hw/virtio') diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index ce542e7..a3eb8ed 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -2,8 +2,8 @@ ifeq ($(CONFIG_VIRTIO),y) common-obj-y += virtio-bus.o obj-y += virtio.o -obj-$(call lor,$(CONFIG_VHOST_USER),$(CONFIG_LINUX)) += vhost.o vhost-backend.o -common-obj-$(call lnot,$(call lor,$(CONFIG_VHOST_USER),$(CONFIG_LINUX))) += vhost-stub.o +obj-$(call lor,$(CONFIG_VHOST_USER),$(CONFIG_VHOST_KERNEL)) += vhost.o vhost-backend.o +common-obj-$(call lnot,$(call lor,$(CONFIG_VHOST_USER),$(CONFIG_VHOST_KERNEL))) += vhost-stub.o obj-$(CONFIG_VHOST_USER) += vhost-user.o common-obj-$(CONFIG_VIRTIO_RNG) += virtio-rng.o diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c index e0f0bb7..96b8d3c 100644 --- a/hw/virtio/vhost-backend.c +++ b/hw/virtio/vhost-backend.c @@ -14,7 +14,7 @@ #include "qemu/error-report.h" #include "standard-headers/linux/vhost_types.h" -#ifdef CONFIG_LINUX +#ifdef CONFIG_VHOST_KERNEL #include #include @@ -275,7 +275,7 @@ int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type) int r = 0; switch (backend_type) { -#ifdef CONFIG_LINUX +#ifdef CONFIG_VHOST_KERNEL case VHOST_BACKEND_TYPE_KERNEL: dev->vhost_ops = &kernel_ops; break; -- cgit v1.1 From f6deb6d95aa7c29fa0047057512060ca720cad22 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 14 Feb 2019 15:39:12 +1100 Subject: virtio-balloon: Remove unnecessary MADV_WILLNEED on deflate When the balloon is inflated, we discard memory place in it using madvise() with MADV_DONTNEED. And when we deflate it we use MADV_WILLNEED, which sounds like it makes sense but is actually unnecessary. The misleadingly named MADV_DONTNEED just discards the memory in question, it doesn't set any persistent state on it in-kernel; all that's necessary to bring the memory back is to touch it. MADV_WILLNEED in contrast specifically says that the memory will be used soon and faults it in. This patch simplify's the balloon operation by dropping the madvise() on deflate. This might have an impact on performance - it will move a delay at deflate time until that memory is actually touched, which might be more latency sensitive. However: * Memory that's being given back to the guest by deflating the balloon *might* be used soon, but it equally could just sit around in the guest's pools until needed (or even be faulted out again if the host is under memory pressure). * Usually, the timescale over which you'll be adjusting the balloon is long enough that a few extra faults after deflation aren't going to make a difference. Signed-off-by: David Gibson Reviewed-by: David Hildenbrand Reviewed-by: Michael S. Tsirkin Message-Id: <20190214043916.22128-2-david@gibson.dropbear.id.au> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-balloon.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'hw/virtio') diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index a12677d..43af521 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -35,9 +35,8 @@ static void balloon_page(void *addr, int deflate) { - if (!qemu_balloon_is_inhibited()) { - qemu_madvise(addr, BALLOON_PAGE_SIZE, - deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED); + if (!qemu_balloon_is_inhibited() && !deflate) { + qemu_madvise(addr, BALLOON_PAGE_SIZE, QEMU_MADV_DONTNEED); } } -- cgit v1.1 From b218a70e6ae882f52cc339ae965f515a36a9139f Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 14 Feb 2019 15:39:13 +1100 Subject: virtio-balloon: Corrections to address verification The virtio-balloon device's verification of the address given to it by the guest has a number of faults: * The addresses here are guest physical addresses, which should be 'hwaddr' rather than 'ram_addr_t' (the distinction is admittedly pretty subtle and confusing) * We don't check for section.mr being NULL, which is the main way that memory_region_find() reports basic failures. We really need to check that before looking at any other section fields, because memory_region_find() doesn't initialize them on the failure path * We're passing a length of '1' to memory_region_find(), but really the guest is requesting that we put the entire page into the balloon, so it makes more sense to call it with BALLOON_PAGE_SIZE Signed-off-by: David Gibson Reviewed-by: David Hildenbrand Reviewed-by: Michael S. Tsirkin Message-Id: <20190214043916.22128-3-david@gibson.dropbear.id.au> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-balloon.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'hw/virtio') diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 43af521..eb35782 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -221,17 +221,20 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) } while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) { - ram_addr_t pa; - ram_addr_t addr; + hwaddr pa; + hwaddr addr; int p = virtio_ldl_p(vdev, &pfn); - pa = (ram_addr_t) p << VIRTIO_BALLOON_PFN_SHIFT; + pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT; offset += 4; - /* FIXME: remove get_system_memory(), but how? */ - section = memory_region_find(get_system_memory(), pa, 1); - if (!int128_nz(section.size) || - !memory_region_is_ram(section.mr) || + section = memory_region_find(get_system_memory(), pa, + BALLOON_PAGE_SIZE); + if (!section.mr) { + trace_virtio_balloon_bad_addr(pa); + continue; + } + if (!memory_region_is_ram(section.mr) || memory_region_is_rom(section.mr) || memory_region_is_romd(section.mr)) { trace_virtio_balloon_bad_addr(pa); -- cgit v1.1 From e9550234d79ddb69b01721d8cb197edc0a14a245 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 14 Feb 2019 15:39:14 +1100 Subject: virtio-balloon: Rework ballon_page() interface This replaces the balloon_page() internal interface with ballon_inflate_page(), with a slightly different interface. The new interface will make future alterations simpler. Signed-off-by: David Gibson Reviewed-by: David Hildenbrand Reviewed-by: Michael S. Tsirkin Message-Id: <20190214043916.22128-4-david@gibson.dropbear.id.au> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-balloon.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'hw/virtio') diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index eb35782..bf93148 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -33,11 +33,12 @@ #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) -static void balloon_page(void *addr, int deflate) +static void balloon_inflate_page(VirtIOBalloon *balloon, + MemoryRegion *mr, hwaddr offset) { - if (!qemu_balloon_is_inhibited() && !deflate) { - qemu_madvise(addr, BALLOON_PAGE_SIZE, QEMU_MADV_DONTNEED); - } + void *addr = memory_region_get_ram_ptr(mr) + offset; + + qemu_madvise(addr, BALLOON_PAGE_SIZE, QEMU_MADV_DONTNEED); } static const char *balloon_stat_names[] = { @@ -222,7 +223,6 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) { hwaddr pa; - hwaddr addr; int p = virtio_ldl_p(vdev, &pfn); pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT; @@ -244,11 +244,9 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) trace_virtio_balloon_handle_output(memory_region_name(section.mr), pa); - /* Using memory_region_get_ram_ptr is bending the rules a bit, but - should be OK because we only want a single page. */ - addr = section.offset_within_region; - balloon_page(memory_region_get_ram_ptr(section.mr) + addr, - !!(vq == s->dvq)); + if (!qemu_balloon_is_inhibited() && vq != s->dvq) { + balloon_inflate_page(s, section.mr, section.offset_within_region); + } memory_region_unref(section.mr); } -- cgit v1.1 From dbe1a2774521d838c34b831d89a4bb646a8e9d7c Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 14 Feb 2019 15:39:15 +1100 Subject: virtio-balloon: Use ram_block_discard_range() instead of raw madvise() Currently, virtio-balloon uses madvise() with MADV_DONTNEED to actually discard RAM pages inserted into the balloon. This is basically a Linux only interface (MADV_DONTNEED exists on some other platforms, but doesn't always have the same semantics). It also doesn't work on hugepages and has some other limitations. It turns out that postcopy also needs to discard chunks of memory, and uses a better interface for it: ram_block_discard_range(). It doesn't cover every case, but it covers more than going direct to madvise() and this gives us a single place to update for more possibilities in future. There are some subtleties here to maintain the current balloon behaviour: * For now, we just ignore requests to balloon in a hugepage backed region. That matches current behaviour, because MADV_DONTNEED on a hugepage would simply fail, and we ignore the error. * If host page size is > BALLOON_PAGE_SIZE we can frequently call this on non-host-page-aligned addresses. These would also fail in madvise(), which we then ignored. ram_block_discard_range() error_report()s calls on unaligned addresses, so we explicitly check that case to avoid spamming the logs. * We now call ram_block_discard_range() with the *host* page size, whereas we previously called madvise() with BALLOON_PAGE_SIZE. Surprisingly, this also matches existing behaviour. Although the kernel fails madvise on unaligned addresses, it will round unaligned sizes *up* to the host page size. Yes, this means that if BALLOON_PAGE_SIZE < guest page size we can incorrectly discard more memory than the guest asked us to. I'm planning to address that soon. Errors other than the ones discussed above, will now be reported by ram_block_discard_range(), rather than silently ignored, which means we have a much better chance of seeing when something is going wrong. Signed-off-by: David Gibson Reviewed-by: Michael S. Tsirkin Message-Id: <20190214043916.22128-5-david@gibson.dropbear.id.au> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-balloon.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'hw/virtio') diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index bf93148..e4cd8d5 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -37,8 +37,29 @@ static void balloon_inflate_page(VirtIOBalloon *balloon, MemoryRegion *mr, hwaddr offset) { void *addr = memory_region_get_ram_ptr(mr) + offset; + RAMBlock *rb; + size_t rb_page_size; + ram_addr_t ram_offset; - qemu_madvise(addr, BALLOON_PAGE_SIZE, QEMU_MADV_DONTNEED); + /* XXX is there a better way to get to the RAMBlock than via a + * host address? */ + rb = qemu_ram_block_from_host(addr, false, &ram_offset); + rb_page_size = qemu_ram_pagesize(rb); + + /* Silently ignore hugepage RAM blocks */ + if (rb_page_size != getpagesize()) { + return; + } + + /* Silently ignore unaligned requests */ + if (ram_offset & (rb_page_size - 1)) { + return; + } + + ram_block_discard_range(rb, ram_offset, rb_page_size); + /* We ignore errors from ram_block_discard_range(), because it has + * already reported them, and failing to discard a balloon page is + * not fatal */ } static const char *balloon_stat_names[] = { -- cgit v1.1 From ed48c59875b603058366490f472490f0fb9c30f3 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 14 Feb 2019 15:39:16 +1100 Subject: virtio-balloon: Safely handle BALLOON_PAGE_SIZE < host page size The virtio-balloon always works in units of 4kiB (BALLOON_PAGE_SIZE), but we can only actually discard memory in units of the host page size. Now, we handle this very badly: we silently ignore balloon requests that aren't host page aligned, and for requests that are host page aligned we discard the entire host page. The latter can corrupt guest memory if its page size is smaller than the host's. The obvious choice would be to disable the balloon if the host page size is not 4kiB. However, that would break the special case where host and guest have the same page size, but that's larger than 4kiB. That case currently works by accident[1] - and is used in practice on many production POWER systems where 64kiB has long been the Linux default page size on both host and guest. To make the balloon safe, without breaking that useful special case, we need to accumulate 4kiB balloon requests until we have a whole contiguous host page to discard. We could in principle do that across all guest memory, but it would require a large bitmap to track. This patch represents a compromise: we track ballooned subpages for a single contiguous host page at a time. This means that if the guest discards all 4kiB chunks of a host page in succession, we will discard it. This is the expected behaviour in the (host page) == (guest page) != 4kiB case we want to support. If the guest scatters 4kiB requests across different host pages, we don't discard anything, and issue a warning. Not ideal, but at least we don't corrupt guest memory as the previous version could. Warning reporting is kind of a compromise here. Determining whether we're in a problematic state at realize() time is tricky, because we'd have to look at the host pagesizes of all memory backends, but we can't really know if some of those backends could be for special purpose memory that's not subject to ballooning. Reporting only when the guest tries to balloon a partial page also isn't great because if the guest page size happens to line up it won't indicate that we're in a non ideal situation. It could also cause alarming repeated warnings whenever a migration is attempted. So, what we do is warn the first time the guest attempts balloon a partial host page, whether or not it will end up ballooning the rest of the page immediately afterwards. [1] Because when the guest attempts to balloon a page, it will submit requests for each 4kiB subpage. Most will be ignored, but the one which happens to be host page aligned will discard the whole lot. Signed-off-by: David Gibson Message-Id: <20190214043916.22128-6-david@gibson.dropbear.id.au> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-balloon.c | 69 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 10 deletions(-) (limited to 'hw/virtio') diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index e4cd8d5..d3f2913 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -33,33 +33,82 @@ #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) +struct PartiallyBalloonedPage { + RAMBlock *rb; + ram_addr_t base; + unsigned long bitmap[]; +}; + static void balloon_inflate_page(VirtIOBalloon *balloon, MemoryRegion *mr, hwaddr offset) { void *addr = memory_region_get_ram_ptr(mr) + offset; RAMBlock *rb; size_t rb_page_size; - ram_addr_t ram_offset; + int subpages; + ram_addr_t ram_offset, host_page_base; /* XXX is there a better way to get to the RAMBlock than via a * host address? */ rb = qemu_ram_block_from_host(addr, false, &ram_offset); rb_page_size = qemu_ram_pagesize(rb); + host_page_base = ram_offset & ~(rb_page_size - 1); - /* Silently ignore hugepage RAM blocks */ - if (rb_page_size != getpagesize()) { + if (rb_page_size == BALLOON_PAGE_SIZE) { + /* Easy case */ + + ram_block_discard_range(rb, ram_offset, rb_page_size); + /* We ignore errors from ram_block_discard_range(), because it + * has already reported them, and failing to discard a balloon + * page is not fatal */ return; } - /* Silently ignore unaligned requests */ - if (ram_offset & (rb_page_size - 1)) { - return; + /* Hard case + * + * We've put a piece of a larger host page into the balloon - we + * need to keep track until we have a whole host page to + * discard + */ + warn_report_once( +"Balloon used with backing page size > 4kiB, this may not be reliable"); + + subpages = rb_page_size / BALLOON_PAGE_SIZE; + + if (balloon->pbp + && (rb != balloon->pbp->rb + || host_page_base != balloon->pbp->base)) { + /* We've partially ballooned part of a host page, but now + * we're trying to balloon part of a different one. Too hard, + * give up on the old partial page */ + free(balloon->pbp); + balloon->pbp = NULL; } - ram_block_discard_range(rb, ram_offset, rb_page_size); - /* We ignore errors from ram_block_discard_range(), because it has - * already reported them, and failing to discard a balloon page is - * not fatal */ + if (!balloon->pbp) { + /* Starting on a new host page */ + size_t bitlen = BITS_TO_LONGS(subpages) * sizeof(unsigned long); + balloon->pbp = g_malloc0(sizeof(PartiallyBalloonedPage) + bitlen); + balloon->pbp->rb = rb; + balloon->pbp->base = host_page_base; + } + + bitmap_set(balloon->pbp->bitmap, + (ram_offset - balloon->pbp->base) / BALLOON_PAGE_SIZE, + subpages); + + if (bitmap_full(balloon->pbp->bitmap, subpages)) { + /* We've accumulated a full host page, we can actually discard + * it now */ + + ram_block_discard_range(rb, balloon->pbp->base, rb_page_size); + /* We ignore errors from ram_block_discard_range(), because it + * has already reported them, and failing to discard a balloon + * page is not fatal */ + + free(balloon->pbp); + balloon->pbp = NULL; + } } static const char *balloon_stat_names[] = { -- cgit v1.1