author    Peter Maydell <peter.maydell@linaro.org>  2022-12-15 14:52:12 +0000
committer Peter Maydell <peter.maydell@linaro.org>  2022-12-15 14:52:13 +0000
commit    928eac953918cbd9b237d7cb8b937a6fc575d009 (patch)
tree      1ff8dc7745a748859b089739e1076a205100b380 /migration
parent    48804eebd4a327e4b11f902ba80a00876ee53a43 (diff)
parent    7f401b80445e8746202a6d643410ba1b9eeb3cb1 (diff)
Merge tag 'next-8.0-pull-request' of https://gitlab.com/juan.quintela/qemu into staging
Migration patches for 8.0

Hi

These are the patches that I had to drop from the last PULL request
because they weren't fixes:

- AVX2 is dropped, Intel posted a fix, I have to redo it
- Fix for out of order channels is out; Daniel nacked it and I need to
  redo it

# gpg: Signature made Thu 15 Dec 2022 09:38:29 GMT
# gpg:                using RSA key 1899FF8EDEBF58CCEE034B82F487EF185872D723
# gpg: Good signature from "Juan Quintela <quintela@redhat.com>" [full]
# gpg:                 aka "Juan Quintela <quintela@trasno.org>" [full]
# Primary key fingerprint: 1899 FF8E DEBF 58CC EE03 4B82 F487 EF18 5872 D723

* tag 'next-8.0-pull-request' of https://gitlab.com/juan.quintela/qemu:
  migration: Drop rs->f
  migration: Remove old preempt code around state maintainance
  migration: Send requested page directly in rp-return thread
  migration: Move last_sent_block into PageSearchStatus
  migration: Make PageSearchStatus part of RAMState
  migration: Add pss_init()
  migration: Introduce pss_channel
  migration: Teach PSS about host page
  migration: Use atomic ops properly for page accountings
  migration: Yield bitmap_mutex properly when sending/sleeping
  migration: Remove RAMState.f references in compression code
  migration: Trivial cleanup save_page_header() on same block check
  migration: Cleanup xbzrle zero page cache update logic
  migration: Add postcopy_preempt_active()
  migration: Take bitmap mutex when completing ram migration
  migration: Export ram_release_page()
  migration: Export ram_transferred_add()
  multifd: Create page_count fields into both MultiFD{Recv,Send}Params
  multifd: Create page_size fields into both MultiFD{Recv,Send}Params

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
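As background for the "migration: Use atomic ops properly for page accountings" patch in this series: the RAM counters that are now updated from both the migration thread and the rp-return thread move into Stat64 fields, written with stat64_add() and read with stat64_get(), while the QAPI-visible MigrationStats values are only filled in when queried. Below is a minimal sketch of that pattern, assuming QEMU's qemu/stats64.h helpers; the struct mirrors the MigrationAtomicStats added in migration/ram.h, and the two helper functions are illustrative, not part of the series.

#include "qemu/osdep.h"
#include "qemu/stats64.h"

/* Mirrors the MigrationAtomicStats struct added to migration/ram.h */
typedef struct {
    Stat64 transferred;
    Stat64 duplicate;
    Stat64 normal;
    Stat64 postcopy_bytes;
} MigrationAtomicStats;

static MigrationAtomicStats counters;

/* Writer side: safe from the migration thread and the rp-return thread */
static void account_normal_page(uint64_t page_size)
{
    stat64_add(&counters.normal, 1);
    stat64_add(&counters.transferred, page_size);
}

/* Reader side (e.g. query-migrate): lock-free snapshot of the counter */
static uint64_t normal_bytes(uint64_t page_size)
{
    return stat64_get(&counters.normal) * page_size;
}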
Diffstat (limited to 'migration')
-rw-r--r--  migration/migration.c     47
-rw-r--r--  migration/migration.h      7
-rw-r--r--  migration/multifd-zlib.c  14
-rw-r--r--  migration/multifd-zstd.c  12
-rw-r--r--  migration/multifd.c       27
-rw-r--r--  migration/multifd.h        8
-rw-r--r--  migration/ram.c          729
-rw-r--r--  migration/ram.h           23
8 files changed, 419 insertions, 448 deletions
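For orientation in the migration/ram.c diff below: the "Make PageSearchStatus part of RAMState", "Teach PSS about host page" and "Introduce pss_channel" patches give every channel its own PageSearchStatus and clip the dirty-page scan to the host page currently being sent, so the precopy and postcopy-preempt channels never split a huge page between them. A simplified sketch of that boundary bookkeeping follows, assuming the PFN arithmetic shown in the diff; the struct is cut down and the code is only an illustration, not the actual QEMU implementation.

#include <stdbool.h>
#include <stddef.h>

/* Cut-down PageSearchStatus: one instance per migration channel */
typedef struct {
    unsigned long page;             /* current guest page index (PFN) */
    unsigned long host_page_start;  /* first PFN of the host page being sent */
    unsigned long host_page_end;    /* one past the last PFN of that host page */
    bool host_page_sending;
} PageSearchStatus;

/* Called before sending a host page: derive the PFN window to stay inside */
static void pss_host_page_prepare(PageSearchStatus *pss, size_t host_page_size,
                                  size_t target_page_size)
{
    size_t guest_pfns = host_page_size / target_page_size;

    pss->host_page_sending = true;
    pss->host_page_start = (pss->page / guest_pfns) * guest_pfns;
    pss->host_page_end = pss->host_page_start + guest_pfns;
}

/* True if two channels would be sending the same host page concurrently */
static bool pss_overlap(const PageSearchStatus *a, const PageSearchStatus *b)
{
    return a->host_page_sending && b->host_page_sending &&
           a->host_page_start == b->host_page_start;
}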
diff --git a/migration/migration.c b/migration/migration.c
index 78a0b01..52b5d39 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1045,13 +1045,13 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s)
size_t page_size = qemu_target_page_size();
info->ram = g_malloc0(sizeof(*info->ram));
- info->ram->transferred = ram_counters.transferred;
+ info->ram->transferred = stat64_get(&ram_atomic_counters.transferred);
info->ram->total = ram_bytes_total();
- info->ram->duplicate = ram_counters.duplicate;
+ info->ram->duplicate = stat64_get(&ram_atomic_counters.duplicate);
/* legacy value. It is not used anymore */
info->ram->skipped = 0;
- info->ram->normal = ram_counters.normal;
- info->ram->normal_bytes = ram_counters.normal * page_size;
+ info->ram->normal = stat64_get(&ram_atomic_counters.normal);
+ info->ram->normal_bytes = info->ram->normal * page_size;
info->ram->mbps = s->mbps;
info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
info->ram->dirty_sync_missed_zero_copy =
@@ -1062,7 +1062,7 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s)
info->ram->pages_per_second = s->pages_per_second;
info->ram->precopy_bytes = ram_counters.precopy_bytes;
info->ram->downtime_bytes = ram_counters.downtime_bytes;
- info->ram->postcopy_bytes = ram_counters.postcopy_bytes;
+ info->ram->postcopy_bytes = stat64_get(&ram_atomic_counters.postcopy_bytes);
if (migrate_use_xbzrle()) {
info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
@@ -2840,8 +2840,11 @@ static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
return 0;
}
-/* Release ms->rp_state.from_dst_file in a safe way */
-static void migration_release_from_dst_file(MigrationState *ms)
+/*
+ * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
+ * existed) in a safe way.
+ */
+static void migration_release_dst_files(MigrationState *ms)
{
QEMUFile *file;
@@ -2854,6 +2857,18 @@ static void migration_release_from_dst_file(MigrationState *ms)
ms->rp_state.from_dst_file = NULL;
}
+ /*
+ * Do the same to postcopy fast path socket too if there is. No
+ * locking needed because this qemufile should only be managed by
+ * return path thread.
+ */
+ if (ms->postcopy_qemufile_src) {
+ migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
+ qemu_file_shutdown(ms->postcopy_qemufile_src);
+ qemu_fclose(ms->postcopy_qemufile_src);
+ ms->postcopy_qemufile_src = NULL;
+ }
+
qemu_fclose(file);
}
@@ -2998,7 +3013,7 @@ out:
* Maybe there is something we can do: it looks like a
* network down issue, and we pause for a recovery.
*/
- migration_release_from_dst_file(ms);
+ migration_release_dst_files(ms);
rp = NULL;
if (postcopy_pause_return_path_thread(ms)) {
/*
@@ -3016,7 +3031,7 @@ out:
}
trace_source_return_path_thread_end();
- migration_release_from_dst_file(ms);
+ migration_release_dst_files(ms);
rcu_unregister_thread();
return NULL;
}
@@ -3539,18 +3554,6 @@ static MigThrError postcopy_pause(MigrationState *s)
qemu_file_shutdown(file);
qemu_fclose(file);
- /*
- * Do the same to postcopy fast path socket too if there is. No
- * locking needed because no racer as long as we do this before setting
- * status to paused.
- */
- if (s->postcopy_qemufile_src) {
- migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
- qemu_file_shutdown(s->postcopy_qemufile_src);
- qemu_fclose(s->postcopy_qemufile_src);
- s->postcopy_qemufile_src = NULL;
- }
-
migrate_set_state(&s->state, s->state,
MIGRATION_STATUS_POSTCOPY_PAUSED);
@@ -4391,8 +4394,6 @@ static Property migration_properties[] = {
DEFINE_PROP_SIZE("announce-step", MigrationState,
parameters.announce_step,
DEFAULT_MIGRATE_ANNOUNCE_STEP),
- DEFINE_PROP_BOOL("x-postcopy-preempt-break-huge", MigrationState,
- postcopy_preempt_break_huge, true),
DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds),
DEFINE_PROP_STRING("tls-hostname", MigrationState, parameters.tls_hostname),
DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz),
diff --git a/migration/migration.h b/migration/migration.h
index cdad8ac..ae4ffd3 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -340,13 +340,6 @@ struct MigrationState {
bool send_configuration;
/* Whether we send section footer during migration */
bool send_section_footer;
- /*
- * Whether we allow break sending huge pages when postcopy preempt is
- * enabled. When disabled, we won't interrupt precopy within sending a
- * host huge page, which is the old behavior of vanilla postcopy.
- * NOTE: this parameter is ignored if postcopy preempt is not enabled.
- */
- bool postcopy_preempt_break_huge;
/* Needed by postcopy-pause state */
QemuSemaphore postcopy_pause_sem;
diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 18213a9..3777024 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -116,7 +116,6 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
{
struct zlib_data *z = p->data;
- size_t page_size = qemu_target_page_size();
z_stream *zs = &z->zs;
uint32_t out_size = 0;
int ret;
@@ -135,8 +134,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
* with compression. zlib does not guarantee that this is safe,
* therefore copy the page before calling deflate().
*/
- memcpy(z->buf, p->pages->block->host + p->normal[i], page_size);
- zs->avail_in = page_size;
+ memcpy(z->buf, p->pages->block->host + p->normal[i], p->page_size);
+ zs->avail_in = p->page_size;
zs->next_in = z->buf;
zs->avail_out = available;
@@ -242,12 +241,11 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p)
static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
{
struct zlib_data *z = p->data;
- size_t page_size = qemu_target_page_size();
z_stream *zs = &z->zs;
uint32_t in_size = p->next_packet_size;
/* we measure the change of total_out */
uint32_t out_size = zs->total_out;
- uint32_t expected_size = p->normal_num * page_size;
+ uint32_t expected_size = p->normal_num * p->page_size;
uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
int ret;
int i;
@@ -274,7 +272,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
flush = Z_SYNC_FLUSH;
}
- zs->avail_out = page_size;
+ zs->avail_out = p->page_size;
zs->next_out = p->host + p->normal[i];
/*
@@ -288,8 +286,8 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
do {
ret = inflate(zs, flush);
} while (ret == Z_OK && zs->avail_in
- && (zs->total_out - start) < page_size);
- if (ret == Z_OK && (zs->total_out - start) < page_size) {
+ && (zs->total_out - start) < p->page_size);
+ if (ret == Z_OK && (zs->total_out - start) < p->page_size) {
error_setg(errp, "multifd %u: inflate generated too few output",
p->id);
return -1;
diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
index d788d30..f4a8e1e 100644
--- a/migration/multifd-zstd.c
+++ b/migration/multifd-zstd.c
@@ -113,7 +113,6 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp)
static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
{
struct zstd_data *z = p->data;
- size_t page_size = qemu_target_page_size();
int ret;
uint32_t i;
@@ -128,7 +127,7 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
flush = ZSTD_e_flush;
}
z->in.src = p->pages->block->host + p->normal[i];
- z->in.size = page_size;
+ z->in.size = p->page_size;
z->in.pos = 0;
/*
@@ -241,8 +240,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
{
uint32_t in_size = p->next_packet_size;
uint32_t out_size = 0;
- size_t page_size = qemu_target_page_size();
- uint32_t expected_size = p->normal_num * page_size;
+ uint32_t expected_size = p->normal_num * p->page_size;
uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
struct zstd_data *z = p->data;
int ret;
@@ -265,7 +263,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
for (i = 0; i < p->normal_num; i++) {
z->out.dst = p->host + p->normal[i];
- z->out.size = page_size;
+ z->out.size = p->page_size;
z->out.pos = 0;
/*
@@ -279,8 +277,8 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
do {
ret = ZSTD_decompressStream(z->zds, &z->out, &z->in);
} while (ret > 0 && (z->in.size - z->in.pos > 0)
- && (z->out.pos < page_size));
- if (ret > 0 && (z->out.pos < page_size)) {
+ && (z->out.pos < p->page_size));
+ if (ret > 0 && (z->out.pos < p->page_size)) {
error_setg(errp, "multifd %u: decompressStream buffer too small",
p->id);
return -1;
diff --git a/migration/multifd.c b/migration/multifd.c
index 509bbbe..000ca4d 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -87,15 +87,14 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp)
static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
{
MultiFDPages_t *pages = p->pages;
- size_t page_size = qemu_target_page_size();
for (int i = 0; i < p->normal_num; i++) {
p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i];
- p->iov[p->iovs_num].iov_len = page_size;
+ p->iov[p->iovs_num].iov_len = p->page_size;
p->iovs_num++;
}
- p->next_packet_size = p->normal_num * page_size;
+ p->next_packet_size = p->normal_num * p->page_size;
p->flags |= MULTIFD_FLAG_NOCOMP;
return 0;
}
@@ -139,7 +138,6 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p)
static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
{
uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
- size_t page_size = qemu_target_page_size();
if (flags != MULTIFD_FLAG_NOCOMP) {
error_setg(errp, "multifd %u: flags received %x flags expected %x",
@@ -148,7 +146,7 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
}
for (int i = 0; i < p->normal_num; i++) {
p->iov[i].iov_base = p->host + p->normal[i];
- p->iov[i].iov_len = page_size;
+ p->iov[i].iov_len = p->page_size;
}
return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
}
@@ -281,8 +279,6 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
{
MultiFDPacket_t *packet = p->packet;
- size_t page_size = qemu_target_page_size();
- uint32_t page_count = MULTIFD_PACKET_SIZE / page_size;
RAMBlock *block;
int i;
@@ -309,10 +305,10 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
* If we received a packet that is 100 times bigger than expected
* just stop migration. It is a magic number.
*/
- if (packet->pages_alloc > page_count) {
+ if (packet->pages_alloc > p->page_count) {
error_setg(errp, "multifd: received packet "
"with size %u and expected a size of %u",
- packet->pages_alloc, page_count) ;
+ packet->pages_alloc, p->page_count) ;
return -1;
}
@@ -344,7 +340,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
for (i = 0; i < p->normal_num; i++) {
uint64_t offset = be64_to_cpu(packet->offset[i]);
- if (offset > (block->used_length - page_size)) {
+ if (offset > (block->used_length - p->page_size)) {
error_setg(errp, "multifd: offset too long %" PRIu64
" (max " RAM_ADDR_FMT ")",
offset, block->used_length);
@@ -433,11 +429,10 @@ static int multifd_send_pages(QEMUFile *f)
p->packet_num = multifd_send_state->packet_num++;
multifd_send_state->pages = p->pages;
p->pages = pages;
- transferred = ((uint64_t) pages->num) * qemu_target_page_size()
- + p->packet_len;
+ transferred = ((uint64_t) pages->num) * p->page_size + p->packet_len;
qemu_file_acct_rate_limit(f, transferred);
ram_counters.multifd_bytes += transferred;
- ram_counters.transferred += transferred;
+ stat64_add(&ram_atomic_counters.transferred, transferred);
qemu_mutex_unlock(&p->mutex);
qemu_sem_post(&p->sem);
@@ -629,7 +624,7 @@ int multifd_send_sync_main(QEMUFile *f)
p->pending_job++;
qemu_file_acct_rate_limit(f, p->packet_len);
ram_counters.multifd_bytes += p->packet_len;
- ram_counters.transferred += p->packet_len;
+ stat64_add(&ram_atomic_counters.transferred, p->packet_len);
qemu_mutex_unlock(&p->mutex);
qemu_sem_post(&p->sem);
@@ -947,6 +942,8 @@ int multifd_save_setup(Error **errp)
/* We need one extra place for the packet header */
p->iov = g_new0(struct iovec, page_count + 1);
p->normal = g_new0(ram_addr_t, page_count);
+ p->page_size = qemu_target_page_size();
+ p->page_count = page_count;
if (migrate_use_zero_copy_send()) {
p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
@@ -1194,6 +1191,8 @@ int multifd_load_setup(Error **errp)
p->name = g_strdup_printf("multifdrecv_%d", i);
p->iov = g_new0(struct iovec, page_count);
p->normal = g_new0(ram_addr_t, page_count);
+ p->page_count = page_count;
+ p->page_size = qemu_target_page_size();
}
for (i = 0; i < thread_count; i++) {
diff --git a/migration/multifd.h b/migration/multifd.h
index 519f498..e2802a9 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -80,6 +80,10 @@ typedef struct {
bool registered_yank;
/* packet allocated len */
uint32_t packet_len;
+ /* guest page size */
+ uint32_t page_size;
+ /* number of pages in a full packet */
+ uint32_t page_count;
/* multifd flags for sending ram */
int write_flags;
@@ -143,6 +147,10 @@ typedef struct {
QIOChannel *c;
/* packet allocated len */
uint32_t packet_len;
+ /* guest page size */
+ uint32_t page_size;
+ /* number of pages in a full packet */
+ uint32_t page_count;
/* syncs main thread and channels */
QemuSemaphore sem_sync;
diff --git a/migration/ram.c b/migration/ram.c
index 1338e47..334309f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -85,6 +85,26 @@
XBZRLECacheStats xbzrle_counters;
+/* used by the search for pages to send */
+struct PageSearchStatus {
+ /* The migration channel used for a specific host page */
+ QEMUFile *pss_channel;
+ /* Last block from where we have sent data */
+ RAMBlock *last_sent_block;
+ /* Current block being searched */
+ RAMBlock *block;
+ /* Current page to search from */
+ unsigned long page;
+ /* Set once we wrap around */
+ bool complete_round;
+ /* Whether we're sending a host page */
+ bool host_page_sending;
+ /* The start/end of current host page. Invalid if host_page_sending==false */
+ unsigned long host_page_start;
+ unsigned long host_page_end;
+};
+typedef struct PageSearchStatus PageSearchStatus;
+
/* struct contains XBZRLE cache and a static page
used by the compression */
static struct {
@@ -162,6 +182,11 @@ out:
return ret;
}
+static bool postcopy_preempt_active(void)
+{
+ return migrate_postcopy_preempt() && migration_in_postcopy();
+}
+
bool ramblock_is_ignored(RAMBlock *block)
{
return !qemu_ram_is_migratable(block) ||
@@ -296,30 +321,17 @@ struct RAMSrcPageRequest {
QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
-typedef struct {
- /*
- * Cached ramblock/offset values if preempted. They're only meaningful if
- * preempted==true below.
- */
- RAMBlock *ram_block;
- unsigned long ram_page;
- /*
- * Whether a postcopy preemption just happened. Will be reset after
- * precopy recovered to background migration.
- */
- bool preempted;
-} PostcopyPreemptState;
-
/* State of RAM for migration */
struct RAMState {
- /* QEMUFile used for this migration */
- QEMUFile *f;
+ /*
+ * PageSearchStatus structures for the channels when send pages.
+ * Protected by the bitmap_mutex.
+ */
+ PageSearchStatus pss[RAM_CHANNEL_MAX];
/* UFFD file descriptor, used in 'write-tracking' migration */
int uffdio_fd;
/* Last block that we have visited searching for dirty pages */
RAMBlock *last_seen_block;
- /* Last block from where we have sent data */
- RAMBlock *last_sent_block;
/* Last dirty target page we have sent */
ram_addr_t last_page;
/* last ram version we have seen */
@@ -357,21 +369,18 @@ struct RAMState {
uint64_t target_page_count;
/* number of dirty bits in the bitmap */
uint64_t migration_dirty_pages;
- /* Protects modification of the bitmap and migration dirty pages */
+ /*
+ * Protects:
+ * - dirty/clear bitmap
+ * - migration_dirty_pages
+ * - pss structures
+ */
QemuMutex bitmap_mutex;
/* The RAMBlock used in the last src_page_requests */
RAMBlock *last_req_rb;
/* Queue of outstanding page requests from the destination */
QemuMutex src_page_req_mutex;
QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
-
- /* Postcopy preemption informations */
- PostcopyPreemptState postcopy_preempt_state;
- /*
- * Current channel we're using on src VM. Only valid if postcopy-preempt
- * is enabled.
- */
- unsigned int postcopy_channel;
};
typedef struct RAMState RAMState;
@@ -379,11 +388,6 @@ static RAMState *ram_state;
static NotifierWithReturnList precopy_notifier_list;
-static void postcopy_preempt_reset(RAMState *rs)
-{
- memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
-}
-
/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
@@ -420,18 +424,25 @@ uint64_t ram_bytes_remaining(void)
0;
}
+/*
+ * NOTE: not all stats in ram_counters are used in reality. See comments
+ * for struct MigrationAtomicStats. The ultimate result of ram migration
+ * counters will be a merged version with both ram_counters and the atomic
+ * fields in ram_atomic_counters.
+ */
MigrationStats ram_counters;
+MigrationAtomicStats ram_atomic_counters;
-static void ram_transferred_add(uint64_t bytes)
+void ram_transferred_add(uint64_t bytes)
{
if (runstate_is_running()) {
ram_counters.precopy_bytes += bytes;
} else if (migration_in_postcopy()) {
- ram_counters.postcopy_bytes += bytes;
+ stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
} else {
ram_counters.downtime_bytes += bytes;
}
- ram_counters.transferred += bytes;
+ stat64_add(&ram_atomic_counters.transferred, bytes);
}
void dirty_sync_missed_zero_copy(void)
@@ -439,39 +450,6 @@ void dirty_sync_missed_zero_copy(void)
ram_counters.dirty_sync_missed_zero_copy++;
}
-/* used by the search for pages to send */
-struct PageSearchStatus {
- /* Current block being searched */
- RAMBlock *block;
- /* Current page to search from */
- unsigned long page;
- /* Set once we wrap around */
- bool complete_round;
- /*
- * [POSTCOPY-ONLY] Whether current page is explicitly requested by
- * postcopy. When set, the request is "urgent" because the dest QEMU
- * threads are waiting for us.
- */
- bool postcopy_requested;
- /*
- * [POSTCOPY-ONLY] The target channel to use to send current page.
- *
- * Note: This may _not_ match with the value in postcopy_requested
- * above. Let's imagine the case where the postcopy request is exactly
- * the page that we're sending in progress during precopy. In this case
- * we'll have postcopy_requested set to true but the target channel
- * will be the precopy channel (so that we don't split brain on that
- * specific page since the precopy channel already contains partial of
- * that page data).
- *
- * Besides that specific use case, postcopy_target_channel should
- * always be equal to postcopy_requested, because by default we send
- * postcopy pages via postcopy preempt channel.
- */
- bool postcopy_target_channel;
-};
-typedef struct PageSearchStatus PageSearchStatus;
-
CompressionStats compression_counters;
struct CompressParam {
@@ -517,11 +495,28 @@ static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;
+static int ram_save_host_page_urgent(PageSearchStatus *pss);
+
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
ram_addr_t offset, uint8_t *source_buf);
-static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
- bool postcopy_requested);
+/* NOTE: page is the PFN not real ram_addr_t. */
+static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
+{
+ pss->block = rb;
+ pss->page = page;
+ pss->complete_round = false;
+}
+
+/*
+ * Check whether two PSSs are actively sending the same page. Return true
+ * if it is, false otherwise.
+ */
+static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
+{
+ return pss1->host_page_sending && pss2->host_page_sending &&
+ (pss1->host_page_start == pss2->host_page_start);
+}
static void *do_data_compress(void *opaque)
{
@@ -647,28 +642,30 @@ exit:
*
* Returns the number of bytes written
*
- * @f: QEMUFile where to send the data
+ * @pss: current PSS channel status
* @block: block that contains the page we want to send
* @offset: offset inside the block for the page
* in the lower bits, it contains flags
*/
-static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
+static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
ram_addr_t offset)
{
size_t size, len;
+ bool same_block = (block == pss->last_sent_block);
+ QEMUFile *f = pss->pss_channel;
- if (block == rs->last_sent_block) {
+ if (same_block) {
offset |= RAM_SAVE_FLAG_CONTINUE;
}
qemu_put_be64(f, offset);
size = 8;
- if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
+ if (!same_block) {
len = strlen(block->idstr);
qemu_put_byte(f, len);
qemu_put_buffer(f, (uint8_t *)block->idstr, len);
size += 1 + len;
- rs->last_sent_block = block;
+ pss->last_sent_block = block;
}
return size;
}
@@ -719,7 +716,7 @@ void mig_throttle_counter_reset(void)
rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
rs->num_dirty_pages_period = 0;
- rs->bytes_xfer_prev = ram_counters.transferred;
+ rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
}
/**
@@ -736,10 +733,6 @@ void mig_throttle_counter_reset(void)
*/
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
- if (!rs->xbzrle_enabled) {
- return;
- }
-
/* We don't care if this fails to allocate a new cache page
* as long as it updated an old one */
cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
@@ -756,17 +749,19 @@ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
* -1 means that xbzrle would be longer than normal
*
* @rs: current RAM state
+ * @pss: current PSS channel
* @current_data: pointer to the address of the page contents
* @current_addr: addr of the page
* @block: block that contains the page we want to send
* @offset: offset inside the block for the page
*/
-static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
- ram_addr_t current_addr, RAMBlock *block,
- ram_addr_t offset)
+static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
+ uint8_t **current_data, ram_addr_t current_addr,
+ RAMBlock *block, ram_addr_t offset)
{
int encoded_len = 0, bytes_xbzrle;
uint8_t *prev_cached_page;
+ QEMUFile *file = pss->pss_channel;
if (!cache_is_cached(XBZRLE.cache, current_addr,
ram_counters.dirty_sync_count)) {
@@ -831,11 +826,11 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
}
/* Send XBZRLE based compressed page */
- bytes_xbzrle = save_page_header(rs, rs->f, block,
+ bytes_xbzrle = save_page_header(pss, block,
offset | RAM_SAVE_FLAG_XBZRLE);
- qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
- qemu_put_be16(rs->f, encoded_len);
- qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
+ qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
+ qemu_put_be16(file, encoded_len);
+ qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
bytes_xbzrle += encoded_len + 1 + 2;
/*
* Like compressed_size (please see update_compress_thread_counts),
@@ -849,26 +844,38 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
}
/**
- * migration_bitmap_find_dirty: find the next dirty page from start
+ * pss_find_next_dirty: find the next dirty page of current ramblock
*
- * Returns the page offset within memory region of the start of a dirty page
+ * This function updates pss->page to point to the next dirty page index
+ * within the ramblock to migrate, or the end of ramblock when nothing
+ * found. Note that when pss->host_page_sending==true it means we're
+ * during sending a host page, so we won't look for dirty page that is
+ * outside the host page boundary.
*
- * @rs: current RAM state
- * @rb: RAMBlock where to search for dirty pages
- * @start: page where we start the search
+ * @pss: the current page search status
*/
-static inline
-unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
- unsigned long start)
+static void pss_find_next_dirty(PageSearchStatus *pss)
{
+ RAMBlock *rb = pss->block;
unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
unsigned long *bitmap = rb->bmap;
if (ramblock_is_ignored(rb)) {
- return size;
+ /* Points directly to the end, so we know no dirty page */
+ pss->page = size;
+ return;
+ }
+
+ /*
+ * If during sending a host page, only look for dirty pages within the
+ * current host page being send.
+ */
+ if (pss->host_page_sending) {
+ assert(pss->host_page_end);
+ size = MIN(size, pss->host_page_end);
}
- return find_next_bit(bitmap, size, start);
+ pss->page = find_next_bit(bitmap, size, pss->page);
}
static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
@@ -1083,8 +1090,9 @@ uint64_t ram_pagesize_summary(void)
uint64_t ram_get_total_transferred_pages(void)
{
- return ram_counters.normal + ram_counters.duplicate +
- compression_counters.pages + xbzrle_counters.pages;
+ return stat64_get(&ram_atomic_counters.normal) +
+ stat64_get(&ram_atomic_counters.duplicate) +
+ compression_counters.pages + xbzrle_counters.pages;
}
static void migration_update_rates(RAMState *rs, int64_t end_time)
@@ -1143,8 +1151,8 @@ static void migration_trigger_throttle(RAMState *rs)
{
MigrationState *s = migrate_get_current();
uint64_t threshold = s->parameters.throttle_trigger_threshold;
-
- uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
+ uint64_t bytes_xfer_period =
+ stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
@@ -1207,7 +1215,7 @@ static void migration_bitmap_sync(RAMState *rs)
/* reset period counters */
rs->time_last_bitmap_sync = end_time;
rs->num_dirty_pages_period = 0;
- rs->bytes_xfer_prev = ram_counters.transferred;
+ rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
}
if (migrate_use_events()) {
qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
@@ -1234,7 +1242,7 @@ static void migration_bitmap_sync_precopy(RAMState *rs)
}
}
-static void ram_release_page(const char *rbname, uint64_t offset)
+void ram_release_page(const char *rbname, uint64_t offset)
{
if (!migrate_release_ram() || !migration_in_postcopy()) {
return;
@@ -1249,19 +1257,19 @@ static void ram_release_page(const char *rbname, uint64_t offset)
* Returns the size of data written to the file, 0 means the page is not
* a zero page
*
- * @rs: current RAM state
- * @file: the file where the data is saved
+ * @pss: current PSS channel
* @block: block that contains the page we want to send
* @offset: offset inside the block for the page
*/
-static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
+static int save_zero_page_to_file(PageSearchStatus *pss,
RAMBlock *block, ram_addr_t offset)
{
uint8_t *p = block->host + offset;
+ QEMUFile *file = pss->pss_channel;
int len = 0;
if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
- len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
+ len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
qemu_put_byte(file, 0);
len += 1;
ram_release_page(block->idstr, offset);
@@ -1274,16 +1282,17 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
*
* Returns the number of pages written.
*
- * @rs: current RAM state
+ * @pss: current PSS channel
* @block: block that contains the page we want to send
* @offset: offset inside the block for the page
*/
-static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
+static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
+ ram_addr_t offset)
{
- int len = save_zero_page_to_file(rs, rs->f, block, offset);
+ int len = save_zero_page_to_file(pss, block, offset);
if (len) {
- ram_counters.duplicate++;
+ stat64_add(&ram_atomic_counters.duplicate, 1);
ram_transferred_add(len);
return 1;
}
@@ -1297,15 +1306,15 @@ static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
*
* Return true if the pages has been saved, otherwise false is returned.
*/
-static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
- int *pages)
+static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
+ ram_addr_t offset, int *pages)
{
uint64_t bytes_xmit = 0;
int ret;
*pages = -1;
- ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
- &bytes_xmit);
+ ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
+ TARGET_PAGE_SIZE, &bytes_xmit);
if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
return false;
}
@@ -1320,9 +1329,9 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
}
if (bytes_xmit > 0) {
- ram_counters.normal++;
+ stat64_add(&ram_atomic_counters.normal, 1);
} else if (bytes_xmit == 0) {
- ram_counters.duplicate++;
+ stat64_add(&ram_atomic_counters.duplicate, 1);
}
return true;
@@ -1333,26 +1342,28 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
*
* Returns the number of pages written.
*
- * @rs: current RAM state
+ * @pss: current PSS channel
* @block: block that contains the page we want to send
* @offset: offset inside the block for the page
* @buf: the page to be sent
* @async: send to page asyncly
*/
-static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
- uint8_t *buf, bool async)
+static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
+ ram_addr_t offset, uint8_t *buf, bool async)
{
- ram_transferred_add(save_page_header(rs, rs->f, block,
+ QEMUFile *file = pss->pss_channel;
+
+ ram_transferred_add(save_page_header(pss, block,
offset | RAM_SAVE_FLAG_PAGE));
if (async) {
- qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
+ qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
migrate_release_ram() &&
migration_in_postcopy());
} else {
- qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
+ qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
}
ram_transferred_add(TARGET_PAGE_SIZE);
- ram_counters.normal++;
+ stat64_add(&ram_atomic_counters.normal, 1);
return 1;
}
@@ -1382,8 +1393,8 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
XBZRLE_cache_lock();
if (rs->xbzrle_enabled && !migration_in_postcopy()) {
- pages = save_xbzrle_page(rs, &p, current_addr, block,
- offset);
+ pages = save_xbzrle_page(rs, pss, &p, current_addr,
+ block, offset);
if (!rs->last_stage) {
/* Can't send this cached data async, since the cache page
* might get updated before it gets to the wire
@@ -1394,7 +1405,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
/* XBZRLE overflow or normal page */
if (pages == -1) {
- pages = save_normal_page(rs, block, offset, p, send_async);
+ pages = save_normal_page(pss, block, offset, p, send_async);
}
XBZRLE_cache_unlock();
@@ -1402,13 +1413,13 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
return pages;
}
-static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
+static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
ram_addr_t offset)
{
- if (multifd_queue_page(rs->f, block, offset) < 0) {
+ if (multifd_queue_page(file, block, offset) < 0) {
return -1;
}
- ram_counters.normal++;
+ stat64_add(&ram_atomic_counters.normal, 1);
return 1;
}
@@ -1417,14 +1428,15 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
ram_addr_t offset, uint8_t *source_buf)
{
RAMState *rs = ram_state;
+ PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
uint8_t *p = block->host + offset;
int ret;
- if (save_zero_page_to_file(rs, f, block, offset)) {
+ if (save_zero_page_to_file(pss, block, offset)) {
return true;
}
- save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
+ save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
/*
* copy it to a internal buffer to avoid it being modified by VM
@@ -1446,7 +1458,7 @@ update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
ram_transferred_add(bytes_xmit);
if (param->zero_page) {
- ram_counters.duplicate++;
+ stat64_add(&ram_atomic_counters.duplicate, 1);
return;
}
@@ -1459,6 +1471,7 @@ static bool save_page_use_compression(RAMState *rs);
static void flush_compressed_data(RAMState *rs)
{
+ MigrationState *ms = migrate_get_current();
int idx, len, thread_count;
if (!save_page_use_compression(rs)) {
@@ -1477,7 +1490,7 @@ static void flush_compressed_data(RAMState *rs)
for (idx = 0; idx < thread_count; idx++) {
qemu_mutex_lock(&comp_param[idx].mutex);
if (!comp_param[idx].quit) {
- len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
+ len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
/*
* it's safe to fetch zero_page without holding comp_done_lock
* as there is no further request submitted to the thread,
@@ -1496,11 +1509,11 @@ static inline void set_compress_params(CompressParam *param, RAMBlock *block,
param->offset = offset;
}
-static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
- ram_addr_t offset)
+static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
{
int idx, thread_count, bytes_xmit = -1, pages = -1;
bool wait = migrate_compress_wait_thread();
+ MigrationState *ms = migrate_get_current();
thread_count = migrate_compress_threads();
qemu_mutex_lock(&comp_done_lock);
@@ -1508,7 +1521,8 @@ retry:
for (idx = 0; idx < thread_count; idx++) {
if (comp_param[idx].done) {
comp_param[idx].done = false;
- bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
+ bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
+ comp_param[idx].file);
qemu_mutex_lock(&comp_param[idx].mutex);
set_compress_params(&comp_param[idx], block, offset);
qemu_cond_signal(&comp_param[idx].cond);
@@ -1544,14 +1558,9 @@ retry:
*/
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
- /*
- * This is not a postcopy requested page, mark it "not urgent", and use
- * precopy channel to send it.
- */
- pss->postcopy_requested = false;
- pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
+ /* Update pss->page for the next dirty bit in ramblock */
+ pss_find_next_dirty(pss);
- pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
if (pss->complete_round && pss->block == rs->last_seen_block &&
pss->page >= rs->last_page) {
/*
@@ -1696,7 +1705,7 @@ static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
/* Flush async buffers before un-protect. */
- qemu_fflush(rs->f);
+ qemu_fflush(pss->pss_channel);
/* Un-protect memory range. */
res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
false, false);
@@ -1999,55 +2008,6 @@ void ram_write_tracking_stop(void)
}
#endif /* defined(__linux__) */
-/*
- * Check whether two addr/offset of the ramblock falls onto the same host huge
- * page. Returns true if so, false otherwise.
- */
-static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
- uint64_t addr2)
-{
- size_t page_size = qemu_ram_pagesize(rb);
-
- addr1 = ROUND_DOWN(addr1, page_size);
- addr2 = ROUND_DOWN(addr2, page_size);
-
- return addr1 == addr2;
-}
-
-/*
- * Whether a previous preempted precopy huge page contains current requested
- * page? Returns true if so, false otherwise.
- *
- * This should really happen very rarely, because it means when we were sending
- * during background migration for postcopy we're sending exactly the page that
- * some vcpu got faulted on on dest node. When it happens, we probably don't
- * need to do much but drop the request, because we know right after we restore
- * the precopy stream it'll be serviced. It'll slightly affect the order of
- * postcopy requests to be serviced (e.g. it'll be the same as we move current
- * request to the end of the queue) but it shouldn't be a big deal. The most
- * imporant thing is we can _never_ try to send a partial-sent huge page on the
- * POSTCOPY channel again, otherwise that huge page will got "split brain" on
- * two channels (PRECOPY, POSTCOPY).
- */
-static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
- ram_addr_t offset)
-{
- PostcopyPreemptState *state = &rs->postcopy_preempt_state;
-
- /* No preemption at all? */
- if (!state->preempted) {
- return false;
- }
-
- /* Not even the same ramblock? */
- if (state->ram_block != block) {
- return false;
- }
-
- return offset_on_same_huge_page(block, offset,
- state->ram_page << TARGET_PAGE_BITS);
-}
-
/**
* get_queued_page: unqueue a page from the postcopy requests
*
@@ -2087,20 +2047,7 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
} while (block && !dirty);
- if (block) {
- /* See comment above postcopy_preempted_contains() */
- if (postcopy_preempted_contains(rs, block, offset)) {
- trace_postcopy_preempt_hit(block->idstr, offset);
- /*
- * If what we preempted previously was exactly what we're
- * requesting right now, restore the preempted precopy
- * immediately, boosting its priority as it's requested by
- * postcopy.
- */
- postcopy_preempt_restore(rs, pss, true);
- return true;
- }
- } else {
+ if (!block) {
/*
* Poll write faults too if background snapshot is enabled; that's
* when we have vcpus got blocked by the write protected pages.
@@ -2122,9 +2069,6 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
* really rare.
*/
pss->complete_round = false;
- /* Mark it an urgent request, meanwhile using POSTCOPY channel */
- pss->postcopy_requested = true;
- pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
}
return !!block;
@@ -2202,6 +2146,56 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
return -1;
}
+ /*
+ * When with postcopy preempt, we send back the page directly in the
+ * rp-return thread.
+ */
+ if (postcopy_preempt_active()) {
+ ram_addr_t page_start = start >> TARGET_PAGE_BITS;
+ size_t page_size = qemu_ram_pagesize(ramblock);
+ PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
+ int ret = 0;
+
+ qemu_mutex_lock(&rs->bitmap_mutex);
+
+ pss_init(pss, ramblock, page_start);
+ /*
+ * Always use the preempt channel, and make sure it's there. It's
+ * safe to access without lock, because when rp-thread is running
+ * we should be the only one who operates on the qemufile
+ */
+ pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
+ assert(pss->pss_channel);
+
+ /*
+ * It must be either one or multiple of host page size. Just
+ * assert; if something wrong we're mostly split brain anyway.
+ */
+ assert(len % page_size == 0);
+ while (len) {
+ if (ram_save_host_page_urgent(pss)) {
+ error_report("%s: ram_save_host_page_urgent() failed: "
+ "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
+ __func__, ramblock->idstr, start);
+ ret = -1;
+ break;
+ }
+ /*
+ * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
+ * will automatically be moved and point to the next host page
+ * we're going to send, so no need to update here.
+ *
+ * Normally QEMU never sends >1 host page in requests, so
+ * logically we don't even need that as the loop should only
+ * run once, but just to be consistent.
+ */
+ len -= page_size;
+ };
+ qemu_mutex_unlock(&rs->bitmap_mutex);
+
+ return ret;
+ }
+
struct RAMSrcPageRequest *new_entry =
g_new0(struct RAMSrcPageRequest, 1);
new_entry->rb = ramblock;
@@ -2240,7 +2234,8 @@ static bool save_page_use_compression(RAMState *rs)
* has been properly handled by compression, otherwise needs other
* paths to handle it
*/
-static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
+static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
+ RAMBlock *block, ram_addr_t offset)
{
if (!save_page_use_compression(rs)) {
return false;
@@ -2256,12 +2251,12 @@ static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
* We post the fist page as normal page as compression will take
* much CPU resource.
*/
- if (block != rs->last_sent_block) {
+ if (block != pss->last_sent_block) {
flush_compressed_data(rs);
return false;
}
- if (compress_page_with_multi_thread(rs, block, offset) > 0) {
+ if (compress_page_with_multi_thread(block, offset) > 0) {
return true;
}
@@ -2283,20 +2278,20 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
int res;
- if (control_save_page(rs, block, offset, &res)) {
+ if (control_save_page(pss, block, offset, &res)) {
return res;
}
- if (save_compress_page(rs, block, offset)) {
+ if (save_compress_page(rs, pss, block, offset)) {
return 1;
}
- res = save_zero_page(rs, block, offset);
+ res = save_zero_page(pss, block, offset);
if (res > 0) {
/* Must let xbzrle know, otherwise a previous (now 0'd) cached
* page would be stale
*/
- if (!save_page_use_compression(rs)) {
+ if (rs->xbzrle_enabled) {
XBZRLE_cache_lock();
xbzrle_cache_zero_page(rs, block->offset + offset);
XBZRLE_cache_unlock();
@@ -2311,133 +2306,97 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
* still see partially copied pages which is data corruption.
*/
if (migrate_use_multifd() && !migration_in_postcopy()) {
- return ram_save_multifd_page(rs, block, offset);
+ return ram_save_multifd_page(pss->pss_channel, block, offset);
}
return ram_save_page(rs, pss);
}
-static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
+/* Should be called before sending a host page */
+static void pss_host_page_prepare(PageSearchStatus *pss)
{
- MigrationState *ms = migrate_get_current();
-
- /* Not enabled eager preempt? Then never do that. */
- if (!migrate_postcopy_preempt()) {
- return false;
- }
+ /* How many guest pages are there in one host page? */
+ size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
- /* If the user explicitly disabled breaking of huge page, skip */
- if (!ms->postcopy_preempt_break_huge) {
- return false;
- }
+ pss->host_page_sending = true;
+ pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
+ pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
+}
- /* If the ramblock we're sending is a small page? Never bother. */
- if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
- return false;
- }
+/*
+ * Whether the page pointed by PSS is within the host page being sent.
+ * Must be called after a previous pss_host_page_prepare().
+ */
+static bool pss_within_range(PageSearchStatus *pss)
+{
+ ram_addr_t ram_addr;
- /* Not in postcopy at all? */
- if (!migration_in_postcopy()) {
- return false;
- }
+ assert(pss->host_page_sending);
- /*
- * If we're already handling a postcopy request, don't preempt as this page
- * has got the same high priority.
- */
- if (pss->postcopy_requested) {
+ /* Over host-page boundary? */
+ if (pss->page >= pss->host_page_end) {
return false;
}
- /* If there's postcopy requests, then check it up! */
- return postcopy_has_request(rs);
-}
-
-/* Returns true if we preempted precopy, false otherwise */
-static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
-{
- PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
-
- trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
+ ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
- /*
- * Time to preempt precopy. Cache current PSS into preempt state, so that
- * after handling the postcopy pages we can recover to it. We need to do
- * so because the dest VM will have partial of the precopy huge page kept
- * over in its tmp huge page caches; better move on with it when we can.
- */
- p_state->ram_block = pss->block;
- p_state->ram_page = pss->page;
- p_state->preempted = true;
+ return offset_in_ramblock(pss->block, ram_addr);
}
-/* Whether we're preempted by a postcopy request during sending a huge page */
-static bool postcopy_preempt_triggered(RAMState *rs)
+static void pss_host_page_finish(PageSearchStatus *pss)
{
- return rs->postcopy_preempt_state.preempted;
+ pss->host_page_sending = false;
+ /* This is not needed, but just to reset it */
+ pss->host_page_start = pss->host_page_end = 0;
}
-static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
- bool postcopy_requested)
+/*
+ * Send an urgent host page specified by `pss'. Need to be called with
+ * bitmap_mutex held.
+ *
+ * Returns 0 if save host page succeeded, false otherwise.
+ */
+static int ram_save_host_page_urgent(PageSearchStatus *pss)
{
- PostcopyPreemptState *state = &rs->postcopy_preempt_state;
-
- assert(state->preempted);
+ bool page_dirty, sent = false;
+ RAMState *rs = ram_state;
+ int ret = 0;
- pss->block = state->ram_block;
- pss->page = state->ram_page;
+ trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
+ pss_host_page_prepare(pss);
- /* Whether this is a postcopy request? */
- pss->postcopy_requested = postcopy_requested;
/*
- * When restoring a preempted page, the old data resides in PRECOPY
- * slow channel, even if postcopy_requested is set. So always use
- * PRECOPY channel here.
+ * If precopy is sending the same page, let it be done in precopy, or
+ * we could send the same page in two channels and none of them will
+ * receive the whole page.
*/
- pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
-
- trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
-
- /* Reset preempt state, most importantly, set preempted==false */
- postcopy_preempt_reset(rs);
-}
-
-static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
-{
- MigrationState *s = migrate_get_current();
- unsigned int channel = pss->postcopy_target_channel;
- QEMUFile *next;
-
- if (channel != rs->postcopy_channel) {
- if (channel == RAM_CHANNEL_PRECOPY) {
- next = s->to_dst_file;
- } else {
- next = s->postcopy_qemufile_src;
- }
- /* Update and cache the current channel */
- rs->f = next;
- rs->postcopy_channel = channel;
-
- /*
- * If channel switched, reset last_sent_block since the old sent block
- * may not be on the same channel.
- */
- rs->last_sent_block = NULL;
-
- trace_postcopy_preempt_switch_channel(channel);
+ if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
+ trace_postcopy_preempt_hit(pss->block->idstr,
+ pss->page << TARGET_PAGE_BITS);
+ return 0;
}
- trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
-}
-
-/* We need to make sure rs->f always points to the default channel elsewhere */
-static void postcopy_preempt_reset_channel(RAMState *rs)
-{
- if (migrate_postcopy_preempt() && migration_in_postcopy()) {
- rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
- rs->f = migrate_get_current()->to_dst_file;
- trace_postcopy_preempt_reset_channel();
+ do {
+ page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
+
+ if (page_dirty) {
+ /* Be strict to return code; it must be 1, or what else? */
+ if (ram_save_target_page(rs, pss) != 1) {
+ error_report_once("%s: ram_save_target_page failed", __func__);
+ ret = -1;
+ goto out;
+ }
+ sent = true;
+ }
+ pss_find_next_dirty(pss);
+ } while (pss_within_range(pss));
+out:
+ pss_host_page_finish(pss);
+ /* For urgent requests, flush immediately if sent */
+ if (sent) {
+ qemu_fflush(pss->pss_channel);
}
+ return ret;
}
/**
@@ -2448,9 +2407,14 @@ static void postcopy_preempt_reset_channel(RAMState *rs)
* a host page in which case the remainder of the hostpage is sent.
* Only dirty target pages are sent. Note that the host page size may
* be a huge page for this block.
+ *
* The saving stops at the boundary of the used_length of the block
* if the RAMBlock isn't a multiple of the host page size.
*
+ * The caller must be with ram_state.bitmap_mutex held to call this
+ * function. Note that this function can temporarily release the lock, but
+ * when the function is returned it'll make sure the lock is still held.
+ *
* Returns the number of pages written or negative on error
*
* @rs: current RAM state
@@ -2458,11 +2422,10 @@ static void postcopy_preempt_reset_channel(RAMState *rs)
*/
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
{
+ bool page_dirty, preempt_active = postcopy_preempt_active();
int tmppages, pages = 0;
size_t pagesize_bits =
qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
- unsigned long hostpage_boundary =
- QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
unsigned long start_page = pss->page;
int res;
@@ -2471,51 +2434,49 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
return 0;
}
- if (migrate_postcopy_preempt() && migration_in_postcopy()) {
- postcopy_preempt_choose_channel(rs, pss);
- }
+ /* Update host page boundary information */
+ pss_host_page_prepare(pss);
do {
- if (postcopy_needs_preempt(rs, pss)) {
- postcopy_do_preempt(rs, pss);
- break;
- }
+ page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
/* Check the pages is dirty and if it is send it */
- if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
- tmppages = ram_save_target_page(rs, pss);
- if (tmppages < 0) {
- return tmppages;
- }
-
- pages += tmppages;
+ if (page_dirty) {
/*
- * Allow rate limiting to happen in the middle of huge pages if
- * something is sent in the current iteration.
+ * Properly yield the lock only in postcopy preempt mode
+ * because both migration thread and rp-return thread can
+ * operate on the bitmaps.
*/
- if (pagesize_bits > 1 && tmppages > 0) {
- migration_rate_limit();
+ if (preempt_active) {
+ qemu_mutex_unlock(&rs->bitmap_mutex);
+ }
+ tmppages = ram_save_target_page(rs, pss);
+ if (tmppages >= 0) {
+ pages += tmppages;
+ /*
+ * Allow rate limiting to happen in the middle of huge pages if
+ * something is sent in the current iteration.
+ */
+ if (pagesize_bits > 1 && tmppages > 0) {
+ migration_rate_limit();
+ }
}
+ if (preempt_active) {
+ qemu_mutex_lock(&rs->bitmap_mutex);
+ }
+ } else {
+ tmppages = 0;
}
- pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
- } while ((pss->page < hostpage_boundary) &&
- offset_in_ramblock(pss->block,
- ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
- /* The offset we leave with is the min boundary of host page and block */
- pss->page = MIN(pss->page, hostpage_boundary);
- /*
- * When with postcopy preempt mode, flush the data as soon as possible for
- * postcopy requests, because we've already sent a whole huge page, so the
- * dst node should already have enough resource to atomically filling in
- * the current missing page.
- *
- * More importantly, when using separate postcopy channel, we must do
- * explicit flush or it won't flush until the buffer is full.
- */
- if (migrate_postcopy_preempt() && pss->postcopy_requested) {
- qemu_fflush(rs->f);
- }
+ if (tmppages < 0) {
+ pss_host_page_finish(pss);
+ return tmppages;
+ }
+
+ pss_find_next_dirty(pss);
+ } while (pss_within_range(pss));
+
+ pss_host_page_finish(pss);
res = ram_save_release_protection(rs, pss, start_page);
return (res < 0 ? res : pages);
@@ -2536,7 +2497,7 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
*/
static int ram_find_and_save_block(RAMState *rs)
{
- PageSearchStatus pss;
+ PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
int pages = 0;
bool again, found;
@@ -2557,35 +2518,24 @@ static int ram_find_and_save_block(RAMState *rs)
rs->last_page = 0;
}
- pss.block = rs->last_seen_block;
- pss.page = rs->last_page;
- pss.complete_round = false;
+ pss_init(pss, rs->last_seen_block, rs->last_page);
do {
again = true;
- found = get_queued_page(rs, &pss);
+ found = get_queued_page(rs, pss);
if (!found) {
- /*
- * Recover previous precopy ramblock/offset if postcopy has
- * preempted precopy. Otherwise find the next dirty bit.
- */
- if (postcopy_preempt_triggered(rs)) {
- postcopy_preempt_restore(rs, &pss, false);
- found = true;
- } else {
- /* priority queue empty, so just search for something dirty */
- found = find_dirty_block(rs, &pss, &again);
- }
+ /* priority queue empty, so just search for something dirty */
+ found = find_dirty_block(rs, pss, &again);
}
if (found) {
- pages = ram_save_host_page(rs, &pss);
+ pages = ram_save_host_page(rs, pss);
}
} while (!pages && again);
- rs->last_seen_block = pss.block;
- rs->last_page = pss.page;
+ rs->last_seen_block = pss->block;
+ rs->last_page = pss->page;
return pages;
}
@@ -2595,9 +2545,9 @@ void acct_update_position(QEMUFile *f, size_t size, bool zero)
uint64_t pages = size / TARGET_PAGE_SIZE;
if (zero) {
- ram_counters.duplicate += pages;
+ stat64_add(&ram_atomic_counters.duplicate, pages);
} else {
- ram_counters.normal += pages;
+ stat64_add(&ram_atomic_counters.normal, pages);
ram_transferred_add(size);
qemu_file_credit_transfer(f, size);
}
@@ -2699,13 +2649,16 @@ static void ram_save_cleanup(void *opaque)
static void ram_state_reset(RAMState *rs)
{
+ int i;
+
+ for (i = 0; i < RAM_CHANNEL_MAX; i++) {
+ rs->pss[i].last_sent_block = NULL;
+ }
+
rs->last_seen_block = NULL;
- rs->last_sent_block = NULL;
rs->last_page = 0;
rs->last_version = ram_list.version;
rs->xbzrle_enabled = false;
- postcopy_preempt_reset(rs);
- rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */
@@ -2894,8 +2847,8 @@ void ram_postcopy_send_discard_bitmap(MigrationState *ms)
migration_bitmap_sync(rs);
/* Easiest way to make sure we don't resume in the middle of a host-page */
+ rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
rs->last_seen_block = NULL;
- rs->last_sent_block = NULL;
rs->last_page = 0;
postcopy_each_ram_send_discard(ms);
@@ -3132,7 +3085,7 @@ static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
ram_state_reset(rs);
/* Update RAMState cache of output QEMUFile */
- rs->f = out;
+ rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
trace_ram_state_resume_prepare(pages);
}
@@ -3223,7 +3176,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
return -1;
}
}
- (*rsp)->f = f;
+ (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
WITH_RCU_READ_LOCK_GUARD() {
qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
@@ -3349,8 +3302,6 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
}
qemu_mutex_unlock(&rs->bitmap_mutex);
- postcopy_preempt_reset_channel(rs);
-
/*
* Must occur before EOS (or any QEMUFile operation)
* because of RDMA protocol.
@@ -3360,7 +3311,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
out:
if (ret >= 0
&& migration_is_setup_or_active(migrate_get_current()->state)) {
- ret = multifd_send_sync_main(rs->f);
+ ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
if (ret < 0) {
return ret;
}
@@ -3406,6 +3357,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
/* try transferring iterative blocks of memory */
/* flush all remaining blocks regardless of rate limiting */
+ qemu_mutex_lock(&rs->bitmap_mutex);
while (true) {
int pages;
@@ -3419,6 +3371,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
break;
}
}
+ qemu_mutex_unlock(&rs->bitmap_mutex);
flush_compressed_data(rs);
ram_control_after_iterate(f, RAM_CONTROL_FINISH);
@@ -3428,9 +3381,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
return ret;
}
- postcopy_preempt_reset_channel(rs);
-
- ret = multifd_send_sync_main(rs->f);
+ ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
if (ret < 0) {
return ret;
}
diff --git a/migration/ram.h b/migration/ram.h
index c7af65a..81cbb09 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -32,7 +32,27 @@
#include "qapi/qapi-types-migration.h"
#include "exec/cpu-common.h"
#include "io/channel.h"
+#include "qemu/stats64.h"
+/*
+ * These are the migration statistic counters that need to be updated using
+ * atomic ops (can be accessed by more than one thread). Here since we
+ * cannot modify MigrationStats directly to use Stat64 as it was defined in
+ * the QAPI scheme, we define an internal structure to hold them, and we
+ * propagate the real values when QMP queries happen.
+ *
+ * IOW, the corresponding fields within ram_counters on these specific
+ * fields will be always zero and not being used at all; they're just
+ * placeholders to make it QAPI-compatible.
+ */
+typedef struct {
+ Stat64 transferred;
+ Stat64 duplicate;
+ Stat64 normal;
+ Stat64 postcopy_bytes;
+} MigrationAtomicStats;
+
+extern MigrationAtomicStats ram_atomic_counters;
extern MigrationStats ram_counters;
extern XBZRLECacheStats xbzrle_counters;
extern CompressionStats compression_counters;
@@ -65,6 +85,9 @@ int ram_load_postcopy(QEMUFile *f, int channel);
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
+void ram_transferred_add(uint64_t bytes);
+void ram_release_page(const char *rbname, uint64_t offset);
+
int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr);
bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset);
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr);
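Closing note on the two multifd patches at the end of the series: the MultiFD{Send,Recv}Params hunks above cache the guest page size and the per-packet page count at channel setup, so the send and receive hot paths no longer call qemu_target_page_size() for every page. A small sketch of that setup-time computation, assuming a 512 KiB packet as a stand-in for MULTIFD_PACKET_SIZE; the struct and helper names other than page_size/page_count are hypothetical.

#include <stdint.h>
#include <stddef.h>

#define PACKET_SIZE (512 * 1024)   /* stand-in for MULTIFD_PACKET_SIZE */

/* Subset of MultiFDSendParams/MultiFDRecvParams with the new cached fields */
typedef struct {
    uint32_t page_size;   /* guest page size, cached once at setup */
    uint32_t page_count;  /* number of pages in a full packet */
} ChannelParams;

/* Done once per channel in the save/load setup paths */
static void channel_setup(ChannelParams *p, uint32_t target_page_size)
{
    p->page_size = target_page_size;
    p->page_count = PACKET_SIZE / target_page_size;
}

/* Hot path: packet payload size without re-querying the target page size */
static size_t payload_bytes(const ChannelParams *p, uint32_t normal_num)
{
    return (size_t)normal_num * p->page_size;
}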