aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Henderson <richard.henderson@linaro.org>2021-10-19 07:41:04 -0700
committerRichard Henderson <richard.henderson@linaro.org>2021-10-19 07:41:04 -0700
commit50352cce138ef3b30c1cda28a4df68fff5da3202 (patch)
treecac0f47717ae21a351c720847fedb41425a10263
parent362534a643b4a34bcb223996538ce9de5cdab946 (diff)
parent911965ace9386e35ca022a65bb45a32fd421af3e (diff)
downloadqemu-50352cce138ef3b30c1cda28a4df68fff5da3202.zip
qemu-50352cce138ef3b30c1cda28a4df68fff5da3202.tar.gz
qemu-50352cce138ef3b30c1cda28a4df68fff5da3202.tar.bz2
Merge remote-tracking branch 'remotes/juanquintela/tags/migration.next-pull-request' into staging
Migration Pull request (3rd try) Hi This should fix all the freebsd problems. Please apply, # gpg: Signature made Tue 19 Oct 2021 02:28:51 AM PDT # gpg: using RSA key 1899FF8EDEBF58CCEE034B82F487EF185872D723 # gpg: Good signature from "Juan Quintela <quintela@redhat.com>" [full] # gpg: aka "Juan Quintela <quintela@trasno.org>" [full] * remotes/juanquintela/tags/migration.next-pull-request: migration/rdma: advise prefetch write for ODP region migration/rdma: Try to register On-Demand Paging memory region migration: allow enabling multifd for specific protocol only migration: allow multifd for socket protocol only migration/ram: Don't pass RAMState to migration_clear_memory_region_dirty_bitmap_*() multifd: Unconditionally unregister yank function multifd: Implement yank for multifd send side Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-rw-r--r--meson.build6
-rw-r--r--migration/migration.c12
-rw-r--r--migration/multifd.c35
-rw-r--r--migration/multifd.h4
-rw-r--r--migration/ram.c13
-rw-r--r--migration/rdma.c115
-rw-r--r--migration/trace-events2
7 files changed, 152 insertions, 35 deletions
diff --git a/meson.build b/meson.build
index 5e79467..9ed9a99 100644
--- a/meson.build
+++ b/meson.build
@@ -1530,6 +1530,12 @@ config_host_data.set('HAVE_COPY_FILE_RANGE', cc.has_function('copy_file_range'))
config_host_data.set('HAVE_OPENPTY', cc.has_function('openpty', dependencies: util))
config_host_data.set('HAVE_STRCHRNUL', cc.has_function('strchrnul'))
config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include <stdlib.h>'))
+if rdma.found()
+ config_host_data.set('HAVE_IBV_ADVISE_MR',
+ cc.has_function('ibv_advise_mr',
+ args: config_host['RDMA_LIBS'].split(),
+ prefix: '#include <infiniband/verbs.h>'))
+endif
# has_header_symbol
config_host_data.set('CONFIG_BYTESWAP_H',
diff --git a/migration/migration.c b/migration/migration.c
index 6ac807e..9172686 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -453,10 +453,12 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp)
{
const char *p = NULL;
+ migrate_protocol_allow_multifd(false); /* reset it anyway */
qapi_event_send_migration(MIGRATION_STATUS_SETUP);
if (strstart(uri, "tcp:", &p) ||
strstart(uri, "unix:", NULL) ||
strstart(uri, "vsock:", NULL)) {
+ migrate_protocol_allow_multifd(true);
socket_start_incoming_migration(p ? p : uri, errp);
#ifdef CONFIG_RDMA
} else if (strstart(uri, "rdma:", &p)) {
@@ -1235,6 +1237,14 @@ static bool migrate_caps_check(bool *cap_list,
}
}
+ /* incoming side only */
+ if (runstate_check(RUN_STATE_INMIGRATE) &&
+ !migrate_multifd_is_allowed() &&
+ cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
+ error_setg(errp, "multifd is not supported by current protocol");
+ return false;
+ }
+
return true;
}
@@ -2280,9 +2290,11 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
}
}
+ migrate_protocol_allow_multifd(false);
if (strstart(uri, "tcp:", &p) ||
strstart(uri, "unix:", NULL) ||
strstart(uri, "vsock:", NULL)) {
+ migrate_protocol_allow_multifd(true);
socket_start_outgoing_migration(s, p ? p : uri, &local_err);
#ifdef CONFIG_RDMA
} else if (strstart(uri, "rdma:", &p)) {
diff --git a/migration/multifd.c b/migration/multifd.c
index 377da78..7c9deb1 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -531,7 +531,7 @@ void multifd_save_cleanup(void)
{
int i;
- if (!migrate_use_multifd()) {
+ if (!migrate_use_multifd() || !migrate_multifd_is_allowed()) {
return;
}
multifd_send_terminate_threads(NULL);
@@ -546,6 +546,9 @@ void multifd_save_cleanup(void)
MultiFDSendParams *p = &multifd_send_state->params[i];
Error *local_err = NULL;
+ if (p->registered_yank) {
+ migration_ioc_unregister_yank(p->c);
+ }
socket_send_channel_destroy(p->c);
p->c = NULL;
qemu_mutex_destroy(&p->mutex);
@@ -813,7 +816,8 @@ static bool multifd_channel_connect(MultiFDSendParams *p,
return false;
}
} else {
- /* update for tls qio channel */
+ migration_ioc_register_yank(ioc);
+ p->registered_yank = true;
p->c = ioc;
qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
QEMU_THREAD_JOINABLE);
@@ -864,6 +868,17 @@ cleanup:
multifd_new_send_channel_cleanup(p, sioc, local_err);
}
+static bool migrate_allow_multifd = true;
+void migrate_protocol_allow_multifd(bool allow)
+{
+ migrate_allow_multifd = allow;
+}
+
+bool migrate_multifd_is_allowed(void)
+{
+ return migrate_allow_multifd;
+}
+
int multifd_save_setup(Error **errp)
{
int thread_count;
@@ -874,6 +889,11 @@ int multifd_save_setup(Error **errp)
if (!migrate_use_multifd()) {
return 0;
}
+ if (!migrate_multifd_is_allowed()) {
+ error_setg(errp, "multifd is not supported by current protocol");
+ return -1;
+ }
+
s = migrate_get_current();
thread_count = migrate_multifd_channels();
multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
@@ -967,7 +987,7 @@ int multifd_load_cleanup(Error **errp)
{
int i;
- if (!migrate_use_multifd()) {
+ if (!migrate_use_multifd() || !migrate_multifd_is_allowed()) {
return 0;
}
multifd_recv_terminate_threads(NULL);
@@ -987,10 +1007,7 @@ int multifd_load_cleanup(Error **errp)
for (i = 0; i < migrate_multifd_channels(); i++) {
MultiFDRecvParams *p = &multifd_recv_state->params[i];
- if (OBJECT(p->c)->ref == 1) {
- migration_ioc_unregister_yank(p->c);
- }
-
+ migration_ioc_unregister_yank(p->c);
object_unref(OBJECT(p->c));
p->c = NULL;
qemu_mutex_destroy(&p->mutex);
@@ -1119,6 +1136,10 @@ int multifd_load_setup(Error **errp)
if (!migrate_use_multifd()) {
return 0;
}
+ if (!migrate_multifd_is_allowed()) {
+ error_setg(errp, "multifd is not supported by current protocol");
+ return -1;
+ }
thread_count = migrate_multifd_channels();
multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
diff --git a/migration/multifd.h b/migration/multifd.h
index 8d6751f..15c50ca 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -13,6 +13,8 @@
#ifndef QEMU_MIGRATION_MULTIFD_H
#define QEMU_MIGRATION_MULTIFD_H
+bool migrate_multifd_is_allowed(void);
+void migrate_protocol_allow_multifd(bool allow);
int multifd_save_setup(Error **errp);
void multifd_save_cleanup(void);
int multifd_load_setup(Error **errp);
@@ -85,6 +87,8 @@ typedef struct {
bool running;
/* should this thread finish */
bool quit;
+ /* is the yank function registered */
+ bool registered_yank;
/* thread has work to do */
int pending_job;
/* array of pages to sent */
diff --git a/migration/ram.c b/migration/ram.c
index 7a43bfd..bb90882 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -789,8 +789,7 @@ unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
return find_next_bit(bitmap, size, start);
}
-static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
- RAMBlock *rb,
+static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
unsigned long page)
{
uint8_t shift;
@@ -818,8 +817,7 @@ static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
}
static void
-migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
- RAMBlock *rb,
+migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
unsigned long start,
unsigned long npages)
{
@@ -832,7 +830,7 @@ migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
* exclusive.
*/
for (i = chunk_start; i < chunk_end; i += chunk_pages) {
- migration_clear_memory_region_dirty_bitmap(rs, rb, i);
+ migration_clear_memory_region_dirty_bitmap(rb, i);
}
}
@@ -850,7 +848,7 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
* the page in the chunk we clear the remote dirty bitmap for all.
* Clearing it earlier won't be a problem, but too late will.
*/
- migration_clear_memory_region_dirty_bitmap(rs, rb, page);
+ migration_clear_memory_region_dirty_bitmap(rb, page);
ret = test_and_clear_bit(page, rb->bmap);
if (ret) {
@@ -2777,8 +2775,7 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
* are initially set. Otherwise those skipped pages will be sent in
* the next round after syncing from the memory region bitmap.
*/
- migration_clear_memory_region_dirty_bitmap_range(ram_state, block,
- start, npages);
+ migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
ram_state->migration_dirty_pages -=
bitmap_count_one_with_offset(block->bmap, start, npages);
bitmap_clear(block->bmap, start, npages);
diff --git a/migration/rdma.c b/migration/rdma.c
index 5c2d113..2a3c788 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1117,19 +1117,82 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
return 0;
}
+/* Check whether On-Demand Paging is supported by RDMA device */
+static bool rdma_support_odp(struct ibv_context *dev)
+{
+ struct ibv_device_attr_ex attr = {0};
+ int ret = ibv_query_device_ex(dev, NULL, &attr);
+ if (ret) {
+ return false;
+ }
+
+ if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * ibv_advise_mr to avoid RNR NAK error as far as possible.
+ * The responder mr registering with ODP will send RNR NAK back to
+ * the requester in the face of the page fault.
+ */
+static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
+ uint32_t len, uint32_t lkey,
+ const char *name, bool wr)
+{
+#ifdef HAVE_IBV_ADVISE_MR
+ int ret;
+ int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
+ IBV_ADVISE_MR_ADVICE_PREFETCH;
+ struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};
+
+ ret = ibv_advise_mr(pd, advice,
+ IBV_ADVISE_MR_FLAG_FLUSH, &sg_list, 1);
+ /* ignore the error */
+ if (ret) {
+ trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
+ } else {
+ trace_qemu_rdma_advise_mr(name, len, addr, "successed");
+ }
+#endif
+}
+
static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
{
int i;
RDMALocalBlocks *local = &rdma->local_ram_blocks;
for (i = 0; i < local->nb_blocks; i++) {
+ int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
+
local->block[i].mr =
ibv_reg_mr(rdma->pd,
local->block[i].local_host_addr,
- local->block[i].length,
- IBV_ACCESS_LOCAL_WRITE |
- IBV_ACCESS_REMOTE_WRITE
+ local->block[i].length, access
);
+
+ if (!local->block[i].mr &&
+ errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+ access |= IBV_ACCESS_ON_DEMAND;
+ /* register ODP mr */
+ local->block[i].mr =
+ ibv_reg_mr(rdma->pd,
+ local->block[i].local_host_addr,
+ local->block[i].length, access);
+ trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
+
+ if (local->block[i].mr) {
+ qemu_rdma_advise_prefetch_mr(rdma->pd,
+ (uintptr_t)local->block[i].local_host_addr,
+ local->block[i].length,
+ local->block[i].mr->lkey,
+ local->block[i].block_name,
+ true);
+ }
+ }
+
if (!local->block[i].mr) {
perror("Failed to register local dest ram block!");
break;
@@ -1215,28 +1278,40 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
*/
if (!block->pmr[chunk]) {
uint64_t len = chunk_end - chunk_start;
+ int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
+ 0;
trace_qemu_rdma_register_and_get_keys(len, chunk_start);
- block->pmr[chunk] = ibv_reg_mr(rdma->pd,
- chunk_start, len,
- (rkey ? (IBV_ACCESS_LOCAL_WRITE |
- IBV_ACCESS_REMOTE_WRITE) : 0));
-
- if (!block->pmr[chunk]) {
- perror("Failed to register chunk!");
- fprintf(stderr, "Chunk details: block: %d chunk index %d"
- " start %" PRIuPTR " end %" PRIuPTR
- " host %" PRIuPTR
- " local %" PRIuPTR " registrations: %d\n",
- block->index, chunk, (uintptr_t)chunk_start,
- (uintptr_t)chunk_end, host_addr,
- (uintptr_t)block->local_host_addr,
- rdma->total_registrations);
- return -1;
+ block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+ if (!block->pmr[chunk] &&
+ errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+ access |= IBV_ACCESS_ON_DEMAND;
+ /* register ODP mr */
+ block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+ trace_qemu_rdma_register_odp_mr(block->block_name);
+
+ if (block->pmr[chunk]) {
+ qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
+ len, block->pmr[chunk]->lkey,
+ block->block_name, rkey);
+
+ }
}
- rdma->total_registrations++;
}
+ if (!block->pmr[chunk]) {
+ perror("Failed to register chunk!");
+ fprintf(stderr, "Chunk details: block: %d chunk index %d"
+ " start %" PRIuPTR " end %" PRIuPTR
+ " host %" PRIuPTR
+ " local %" PRIuPTR " registrations: %d\n",
+ block->index, chunk, (uintptr_t)chunk_start,
+ (uintptr_t)chunk_end, host_addr,
+ (uintptr_t)block->local_host_addr,
+ rdma->total_registrations);
+ return -1;
+ }
+ rdma->total_registrations++;
if (lkey) {
*lkey = block->pmr[chunk]->lkey;
diff --git a/migration/trace-events b/migration/trace-events
index a1c0f03..a8ae163 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -212,6 +212,8 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
+qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
+qemu_rdma_advise_mr(const char *name, uint32_t len, uint64_t addr, const char *res) "Try to advise block %s prefetch at %" PRIu32 "@0x%" PRIx64 ": %s"
qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
qemu_rdma_registration_handle_finished(void) ""
qemu_rdma_registration_handle_ram_blocks(void) ""