#include "migration/vmstate.h"
#include "block/block.h"
#include "qapi/error.h"
+#include "qapi/qapi-commands-migration.h"
+#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "qapi/qmp/qnull.h"
#include "qemu/rcu.h"
#include "block.h"
#include "postcopy-ram.h"
#include "qemu/thread.h"
-#include "qmp-commands.h"
#include "trace.h"
-#include "qapi-event.h"
#include "exec/target_page.h"
#include "io/channel-buffer.h"
#include "migration/colo.h"
MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
MIG_RP_MSG_REQ_PAGES, /* data (start: be64, len: be32) */
+ MIG_RP_MSG_RECV_BITMAP, /* send recved_bitmap back to source */
+ MIG_RP_MSG_RESUME_ACK, /* tell source that we are ready to resume */
MIG_RP_MSG_MAX
};
dynamic creation of migration */
static MigrationState *current_migration;
+static MigrationIncomingState *current_incoming;
static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s,
assert(!current_migration);
current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));
+ /*
+ * Init the migrate incoming object as well no matter whether
+ * we'll use it or not.
+ */
+ assert(!current_incoming);
+ current_incoming = g_new0(MigrationIncomingState, 1);
+ current_incoming->state = MIGRATION_STATUS_NONE;
+ current_incoming->postcopy_remote_fds =
+ g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
+ qemu_mutex_init(¤t_incoming->rp_mutex);
+ qemu_event_init(¤t_incoming->main_thread_load_event, false);
+ qemu_sem_init(¤t_incoming->postcopy_pause_sem_dst, 0);
+ qemu_sem_init(¤t_incoming->postcopy_pause_sem_fault, 0);
+
+ init_dirty_bitmap_incoming_migration();
+
if (!migration_object_check(current_migration, &err)) {
error_report_err(err);
exit(1);
MigrationIncomingState *migration_incoming_get_current(void)
{
- static bool once;
- static MigrationIncomingState mis_current;
-
- if (!once) {
- mis_current.state = MIGRATION_STATUS_NONE;
- memset(&mis_current, 0, sizeof(MigrationIncomingState));
- qemu_mutex_init(&mis_current.rp_mutex);
- qemu_event_init(&mis_current.main_thread_load_event, false);
- once = true;
- }
- return &mis_current;
+ assert(current_incoming);
+ return current_incoming;
}
void migration_incoming_state_destroy(void)
qemu_fclose(mis->from_src_file);
mis->from_src_file = NULL;
}
+ if (mis->postcopy_remote_fds) {
+ g_array_free(mis->postcopy_remote_fds, TRUE);
+ mis->postcopy_remote_fds = NULL;
+ }
qemu_event_reset(&mis->main_thread_load_event);
}
state, we need to obey autostart. Any other state is set with
runstate_set. */
+ dirty_bitmap_mig_before_vm_start();
+
if (!global_state_received() ||
global_state_get_runstate() == RUN_STATE_RUNNING) {
if (autostart) {
qemu_file_set_blocking(f, false);
}
-static void migration_incoming_process(void)
+void migration_incoming_process(void)
{
Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
qemu_coroutine_enter(co);
void migration_fd_process_incoming(QEMUFile *f)
{
- migration_incoming_setup(f);
- migration_incoming_process();
+ MigrationIncomingState *mis = migration_incoming_get_current();
+
+ if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
+ /* Resumed from a paused postcopy migration */
+
+ mis->from_src_file = f;
+ /* Postcopy has standalone thread to do vm load */
+ qemu_file_set_blocking(f, true);
+
+ /* Re-configure the return path */
+ mis->to_src_file = qemu_file_get_return_path(f);
+
+ migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
+ MIGRATION_STATUS_POSTCOPY_RECOVER);
+
+ /*
+ * Here, we only wake up the main loading thread (while the
+ * fault thread will still be waiting), so that we can receive
+ * commands from source now, and answer it if needed. The
+ * fault thread will be woken up afterwards until we are sure
+ * that source is ready to reply to page requests.
+ */
+ qemu_sem_post(&mis->postcopy_pause_sem_dst);
+ } else {
+ /* New incoming migration */
+ migration_incoming_setup(f);
+ migration_incoming_process();
+ }
}
void migration_ioc_process_incoming(QIOChannel *ioc)
if (!mis->from_src_file) {
QEMUFile *f = qemu_fopen_channel_input(ioc);
- migration_fd_process_incoming(f);
+ migration_incoming_setup(f);
+ return;
}
- /* We still only have a single channel. Nothing to do here yet */
+ multifd_recv_new_channel(ioc);
}
/**
*/
bool migration_has_all_channels(void)
{
- return true;
+ bool all_channels;
+
+ all_channels = multifd_recv_all_channels_created();
+
+ return all_channels;
}
/*
migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
}
+void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
+ char *block_name)
+{
+ char buf[512];
+ int len;
+ int64_t res;
+
+ /*
+ * First, we send the header part. It contains only the len of
+ * idstr, and the idstr itself.
+ */
+ len = strlen(block_name);
+ buf[0] = len;
+ memcpy(buf + 1, block_name, len);
+
+ if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
+ error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
+ __func__);
+ return;
+ }
+
+ migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);
+
+ /*
+ * Next, we dump the received bitmap to the stream.
+ *
+ * TODO: currently we are safe since we are the only one that is
+ * using the to_src_file handle (fault thread is still paused),
+ * and it's ok even not taking the mutex. However the best way is
+ * to take the lock before sending the message header, and release
+ * the lock after sending the bitmap.
+ */
+ qemu_mutex_lock(&mis->rp_mutex);
+ res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
+ qemu_mutex_unlock(&mis->rp_mutex);
+
+ trace_migrate_send_rp_recv_bitmap(block_name, res);
+}
+
+void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
+{
+ uint32_t buf;
+
+ buf = cpu_to_be32(value);
+ migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
+}
+
MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
{
MigrationCapabilityStatusList *head = NULL;
switch (state) {
case MIGRATION_STATUS_ACTIVE:
case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+ case MIGRATION_STATUS_POSTCOPY_PAUSED:
+ case MIGRATION_STATUS_POSTCOPY_RECOVER:
case MIGRATION_STATUS_SETUP:
case MIGRATION_STATUS_PRE_SWITCHOVER:
case MIGRATION_STATUS_DEVICE:
}
}
-MigrationInfo *qmp_query_migrate(Error **errp)
+static void fill_source_migration_info(MigrationInfo *info)
{
- MigrationInfo *info = g_malloc0(sizeof(*info));
MigrationState *s = migrate_get_current();
switch (s->state) {
case MIGRATION_STATUS_NONE:
/* no migration has happened ever */
+ /* do not overwrite destination migration status */
+ return;
break;
case MIGRATION_STATUS_SETUP:
info->has_status = true;
case MIGRATION_STATUS_POSTCOPY_ACTIVE:
case MIGRATION_STATUS_PRE_SWITCHOVER:
case MIGRATION_STATUS_DEVICE:
+ case MIGRATION_STATUS_POSTCOPY_PAUSED:
+ case MIGRATION_STATUS_POSTCOPY_RECOVER:
/* TODO add some postcopy stats */
info->has_status = true;
info->has_total_time = true;
break;
}
info->status = s->state;
-
- return info;
}
/**
return true;
}
+static void fill_destination_migration_info(MigrationInfo *info)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+
+ switch (mis->state) {
+ case MIGRATION_STATUS_NONE:
+ return;
+ break;
+ case MIGRATION_STATUS_SETUP:
+ case MIGRATION_STATUS_CANCELLING:
+ case MIGRATION_STATUS_CANCELLED:
+ case MIGRATION_STATUS_ACTIVE:
+ case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+ case MIGRATION_STATUS_FAILED:
+ case MIGRATION_STATUS_COLO:
+ info->has_status = true;
+ break;
+ case MIGRATION_STATUS_COMPLETED:
+ info->has_status = true;
+ fill_destination_postcopy_migration_info(info);
+ break;
+ }
+ info->status = mis->state;
+}
+
+MigrationInfo *qmp_query_migrate(Error **errp)
+{
+ MigrationInfo *info = g_malloc0(sizeof(*info));
+
+ fill_destination_migration_info(info);
+ fill_source_migration_info(info);
+
+ return info;
+}
+
void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
Error **errp)
{
MigrationState *s = migrate_get_current();
MigrationCapabilityStatusList *cap;
+ bool cap_list[MIGRATION_CAPABILITY__MAX];
if (migration_is_setup_or_active(s->state)) {
error_setg(errp, QERR_MIGRATION_ACTIVE);
return;
}
- if (!migrate_caps_check(s->enabled_capabilities, params, errp)) {
+ memcpy(cap_list, s->enabled_capabilities, sizeof(cap_list));
+ if (!migrate_caps_check(cap_list, params, errp)) {
return;
}
/* TODO Rewrite "" to null instead */
if (params->has_tls_creds
&& params->tls_creds->type == QTYPE_QNULL) {
- QDECREF(params->tls_creds->u.n);
+ qobject_unref(params->tls_creds->u.n);
params->tls_creds->type = QTYPE_QSTRING;
params->tls_creds->u.s = strdup("");
}
/* TODO Rewrite "" to null instead */
if (params->has_tls_hostname
&& params->tls_hostname->type == QTYPE_QNULL) {
- QDECREF(params->tls_hostname->u.n);
+ qobject_unref(params->tls_hostname->u.n);
params->tls_hostname->type = QTYPE_QSTRING;
params->tls_hostname->u.s = strdup("");
}
{
MigrationState *s = migrate_get_current();
- if (!migrate_postcopy_ram()) {
+ if (!migrate_postcopy()) {
error_setg(errp, "Enable postcopy with migrate_set_capability before"
" the start of migration");
return;
return false;
}
-void qmp_migrate(const char *uri, bool has_blk, bool blk,
- bool has_inc, bool inc, bool has_detach, bool detach,
- Error **errp)
+/* Returns true if continue to migrate, or false if error detected */
+static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
+ bool resume, Error **errp)
{
Error *local_err = NULL;
- MigrationState *s = migrate_get_current();
- const char *p;
+
+ if (resume) {
+ if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
+ error_setg(errp, "Cannot resume if there is no "
+ "paused migration");
+ return false;
+ }
+ /* This is a resume, skip init status */
+ return true;
+ }
if (migration_is_setup_or_active(s->state) ||
s->state == MIGRATION_STATUS_CANCELLING ||
s->state == MIGRATION_STATUS_COLO) {
error_setg(errp, QERR_MIGRATION_ACTIVE);
- return;
+ return false;
}
+
if (runstate_check(RUN_STATE_INMIGRATE)) {
error_setg(errp, "Guest is waiting for an incoming migration");
- return;
+ return false;
}
if (migration_is_blocked(errp)) {
- return;
+ return false;
}
- if ((has_blk && blk) || (has_inc && inc)) {
+ if (blk || blk_inc) {
if (migrate_use_block() || migrate_use_block_incremental()) {
error_setg(errp, "Command options are incompatible with "
"current migration capabilities");
- return;
+ return false;
}
migrate_set_block_enabled(true, &local_err);
if (local_err) {
error_propagate(errp, local_err);
- return;
+ return false;
}
s->must_remove_block_options = true;
}
- if (has_inc && inc) {
+ if (blk_inc) {
migrate_set_block_incremental(s, true);
}
migrate_init(s);
+ return true;
+}
+
+void qmp_migrate(const char *uri, bool has_blk, bool blk,
+ bool has_inc, bool inc, bool has_detach, bool detach,
+ bool has_resume, bool resume, Error **errp)
+{
+ Error *local_err = NULL;
+ MigrationState *s = migrate_get_current();
+ const char *p;
+
+ if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
+ has_resume && resume, errp)) {
+ /* Error detected, put into errp */
+ return;
+ }
+
if (strstart(uri, "tcp:", &p)) {
tcp_start_outgoing_migration(s, p, &local_err);
#ifdef CONFIG_RDMA
"a valid migration protocol");
migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
MIGRATION_STATUS_FAILED);
+ block_cleanup_parameters(s);
return;
}
bool migrate_postcopy(void)
{
- return migrate_postcopy_ram();
+ return migrate_postcopy_ram() || migrate_dirty_bitmaps();
}
bool migrate_auto_converge(void)
return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
}
+bool migrate_postcopy_blocktime(void)
+{
+ MigrationState *s;
+
+ s = migrate_get_current();
+
+ return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
+}
+
bool migrate_use_compression(void)
{
MigrationState *s;
return s->parameters.decompress_threads;
}
+bool migrate_dirty_bitmaps(void)
+{
+ MigrationState *s;
+
+ s = migrate_get_current();
+
+ return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
+}
+
bool migrate_use_events(void)
{
MigrationState *s;
[MIG_RP_MSG_PONG] = { .len = 4, .name = "PONG" },
[MIG_RP_MSG_REQ_PAGES] = { .len = 12, .name = "REQ_PAGES" },
[MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" },
+ [MIG_RP_MSG_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" },
+ [MIG_RP_MSG_RESUME_ACK] = { .len = 4, .name = "RESUME_ACK" },
[MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" },
};
}
}
+/* Return true to retry, false to quit */
+static bool postcopy_pause_return_path_thread(MigrationState *s)
+{
+ trace_postcopy_pause_return_path();
+
+ qemu_sem_wait(&s->postcopy_pause_rp_sem);
+
+ trace_postcopy_pause_return_path_continued();
+
+ return true;
+}
+
+static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
+{
+ RAMBlock *block = qemu_ram_block_by_name(block_name);
+
+ if (!block) {
+ error_report("%s: invalid block name '%s'", __func__, block_name);
+ return -EINVAL;
+ }
+
+ /* Fetch the received bitmap and refresh the dirty bitmap */
+ return ram_dirty_bitmap_reload(s, block);
+}
+
+static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
+{
+ trace_source_return_path_thread_resume_ack(value);
+
+ if (value != MIGRATION_RESUME_ACK_VALUE) {
+ error_report("%s: illegal resume_ack value %"PRIu32,
+ __func__, value);
+ return -1;
+ }
+
+ /* Now both sides are active. */
+ migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
+ MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+ /* Notify send thread that time to continue send pages */
+ qemu_sem_post(&s->rp_state.rp_sem);
+
+ return 0;
+}
+
/*
* Handles messages sent on the return path towards the source VM
*
int res;
trace_source_return_path_thread_entry();
+
+retry:
while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
migration_is_setup_or_active(ms->state)) {
trace_source_return_path_thread_loop_top();
migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
break;
+ case MIG_RP_MSG_RECV_BITMAP:
+ if (header_len < 1) {
+ error_report("%s: missing block name", __func__);
+ mark_source_rp_bad(ms);
+ goto out;
+ }
+ /* Format: len (1B) + idstr (<255B). This ends the idstr. */
+ buf[buf[0] + 1] = '\0';
+ if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
+ mark_source_rp_bad(ms);
+ goto out;
+ }
+ break;
+
+ case MIG_RP_MSG_RESUME_ACK:
+ tmp32 = ldl_be_p(buf);
+ if (migrate_handle_rp_resume_ack(ms, tmp32)) {
+ mark_source_rp_bad(ms);
+ goto out;
+ }
+ break;
+
default:
break;
}
}
- if (qemu_file_get_error(rp)) {
+
+out:
+ res = qemu_file_get_error(rp);
+ if (res) {
+ if (res == -EIO) {
+ /*
+ * Maybe there is something we can do: it looks like a
+ * network down issue, and we pause for a recovery.
+ */
+ if (postcopy_pause_return_path_thread(ms)) {
+ /* Reload rp, reset the rest */
+ rp = ms->rp_state.from_dst_file;
+ ms->rp_state.error = false;
+ goto retry;
+ }
+ }
+
trace_source_return_path_thread_bad_end();
mark_source_rp_bad(ms);
}
trace_source_return_path_thread_end();
-out:
ms->rp_state.from_dst_file = NULL;
qemu_fclose(rp);
return NULL;
}
-static int open_return_path_on_source(MigrationState *ms)
+static int open_return_path_on_source(MigrationState *ms,
+ bool create_thread)
{
ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
}
trace_open_return_path_on_source();
+
+ if (!create_thread) {
+ /* We're done */
+ return 0;
+ }
+
qemu_thread_create(&ms->rp_state.rp_thread, "return path",
source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
}
+typedef enum MigThrError {
+ /* No error detected */
+ MIG_THR_ERR_NONE = 0,
+ /* Detected error, but resumed successfully */
+ MIG_THR_ERR_RECOVERED = 1,
+ /* Detected fatal error, need to exit */
+ MIG_THR_ERR_FATAL = 2,
+} MigThrError;
+
+static int postcopy_resume_handshake(MigrationState *s)
+{
+ qemu_savevm_send_postcopy_resume(s->to_dst_file);
+
+ while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
+ qemu_sem_wait(&s->rp_state.rp_sem);
+ }
+
+ if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+ return 0;
+ }
+
+ return -1;
+}
+
+/* Return zero if success, or <0 for error */
+static int postcopy_do_resume(MigrationState *s)
+{
+ int ret;
+
+ /*
+ * Call all the resume_prepare() hooks, so that modules can be
+ * ready for the migration resume.
+ */
+ ret = qemu_savevm_state_resume_prepare(s);
+ if (ret) {
+ error_report("%s: resume_prepare() failure detected: %d",
+ __func__, ret);
+ return ret;
+ }
+
+ /*
+ * Last handshake with destination on the resume (destination will
+ * switch to postcopy-active afterwards)
+ */
+ ret = postcopy_resume_handshake(s);
+ if (ret) {
+ error_report("%s: handshake failed: %d", __func__, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * We don't return until we are in a safe state to continue current
+ * postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or
+ * MIG_THR_ERR_FATAL if unrecovery failure happened.
+ */
+static MigThrError postcopy_pause(MigrationState *s)
+{
+ assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+ while (true) {
+ migrate_set_state(&s->state, s->state,
+ MIGRATION_STATUS_POSTCOPY_PAUSED);
+
+ /* Current channel is possibly broken. Release it. */
+ assert(s->to_dst_file);
+ qemu_file_shutdown(s->to_dst_file);
+ qemu_fclose(s->to_dst_file);
+ s->to_dst_file = NULL;
+
+ error_report("Detected IO failure for postcopy. "
+ "Migration paused.");
+
+ /*
+ * We wait until things fixed up. Then someone will setup the
+ * status back for us.
+ */
+ while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
+ qemu_sem_wait(&s->postcopy_pause_sem);
+ }
+
+ if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
+ /* Woken up by a recover procedure. Give it a shot */
+
+ /*
+ * Firstly, let's wake up the return path now, with a new
+ * return path channel.
+ */
+ qemu_sem_post(&s->postcopy_pause_rp_sem);
+
+ /* Do the resume logic */
+ if (postcopy_do_resume(s) == 0) {
+ /* Let's continue! */
+ trace_postcopy_pause_continued();
+ return MIG_THR_ERR_RECOVERED;
+ } else {
+ /*
+ * Something wrong happened during the recovery, let's
+ * pause again. Pause is always better than throwing
+ * data away.
+ */
+ continue;
+ }
+ } else {
+ /* This is not right... Time to quit. */
+ return MIG_THR_ERR_FATAL;
+ }
+ }
+}
+
+static MigThrError migration_detect_error(MigrationState *s)
+{
+ int ret;
+
+ /* Try to detect any file errors */
+ ret = qemu_file_get_error(s->to_dst_file);
+
+ if (!ret) {
+ /* Everything is fine */
+ return MIG_THR_ERR_NONE;
+ }
+
+ if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
+ /*
+ * For postcopy, we allow the network to be down for a
+ * while. After that, it can be continued by a
+ * recovery phase.
+ */
+ return postcopy_pause(s);
+ } else {
+ /*
+ * For precopy (or postcopy with error outside IO), we fail
+ * with no time.
+ */
+ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
+ trace_migration_thread_file_err();
+
+ /* Time to stop the migration, now. */
+ return MIG_THR_ERR_FATAL;
+ }
+}
+
static void migration_calculate_complete(MigrationState *s)
{
uint64_t bytes = qemu_ftell(s->to_dst_file);
*/
static MigIterateState migration_iteration_run(MigrationState *s)
{
- uint64_t pending_size, pend_post, pend_nonpost;
+ uint64_t pending_size, pend_pre, pend_compat, pend_post;
bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
- qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
- &pend_nonpost, &pend_post);
- pending_size = pend_nonpost + pend_post;
+ qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
+ &pend_compat, &pend_post);
+ pending_size = pend_pre + pend_compat + pend_post;
trace_migrate_pending(pending_size, s->threshold_size,
- pend_post, pend_nonpost);
+ pend_pre, pend_compat, pend_post);
if (pending_size && pending_size >= s->threshold_size) {
/* Still a significant amount to transfer */
if (migrate_postcopy() && !in_postcopy &&
- pend_nonpost <= s->threshold_size &&
+ pend_pre <= s->threshold_size &&
atomic_read(&s->start_postcopy)) {
if (postcopy_start(s)) {
error_report("%s: postcopy failed to start", __func__);
{
MigrationState *s = opaque;
int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+ MigThrError thr_error;
rcu_register_thread();
}
}
- if (qemu_file_get_error(s->to_dst_file)) {
- if (migration_is_setup_or_active(s->state)) {
- migrate_set_state(&s->state, s->state,
- MIGRATION_STATUS_FAILED);
- }
- trace_migration_thread_file_err();
+ /*
+ * Try to detect any kind of failures, and see whether we
+ * should stop the migration now.
+ */
+ thr_error = migration_detect_error(s);
+ if (thr_error == MIG_THR_ERR_FATAL) {
+ /* Stop migration */
break;
+ } else if (thr_error == MIG_THR_ERR_RECOVERED) {
+ /*
+ * Just recovered from a e.g. network failure, reset all
+ * the local variables. This is important to avoid
+ * breaking transferred_bytes and bandwidth calculation
+ */
+ s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ s->iteration_initial_bytes = 0;
}
current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
void migrate_fd_connect(MigrationState *s, Error *error_in)
{
+ int64_t rate_limit;
+ bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
+
s->expected_downtime = s->parameters.downtime_limit;
s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
if (error_in) {
return;
}
- qemu_file_set_blocking(s->to_dst_file, true);
- qemu_file_set_rate_limit(s->to_dst_file,
- s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
+ if (resume) {
+ /* This is a resumed migration */
+ rate_limit = INT64_MAX;
+ } else {
+ /* This is a fresh new migration */
+ rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO;
+ s->expected_downtime = s->parameters.downtime_limit;
+ s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
- /* Notify before starting migration thread */
- notifier_list_notify(&migration_state_notifiers, s);
+ /* Notify before starting migration thread */
+ notifier_list_notify(&migration_state_notifiers, s);
+ }
+
+ qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
+ qemu_file_set_blocking(s->to_dst_file, true);
/*
* Open the return path. For postcopy, it is used exclusively. For
* QEMU uses the return path.
*/
if (migrate_postcopy_ram() || migrate_use_return_path()) {
- if (open_return_path_on_source(s)) {
+ if (open_return_path_on_source(s, !resume)) {
error_report("Unable to open return-path for postcopy");
- migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
- MIGRATION_STATUS_FAILED);
+ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
migrate_fd_cleanup(s);
return;
}
}
+ if (resume) {
+ /* Wakeup the main migration thread to do the recovery */
+ migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
+ MIGRATION_STATUS_POSTCOPY_RECOVER);
+ qemu_sem_post(&s->postcopy_pause_sem);
+ return;
+ }
+
if (multifd_save_setup() != 0) {
migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
MIGRATION_STATUS_FAILED);
g_free(params->tls_hostname);
g_free(params->tls_creds);
qemu_sem_destroy(&ms->pause_sem);
+ qemu_sem_destroy(&ms->postcopy_pause_sem);
+ qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
+ qemu_sem_destroy(&ms->rp_state.rp_sem);
+ error_free(ms->error);
}
static void migration_instance_init(Object *obj)
params->has_x_multifd_channels = true;
params->has_x_multifd_page_count = true;
params->has_xbzrle_cache_size = true;
+
+ qemu_sem_init(&ms->postcopy_pause_sem, 0);
+ qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
+ qemu_sem_init(&ms->rp_state.rp_sem, 0);
}
/*