X-Git-Url: https://repo.jachan.dev/qemu.git/blobdiff_plain/20a519a05a8a135caa4569c3bf5f6a53c4de5f3e..0781c1ed1cbe1361b45f8fddfc85d202a517a88c:/migration/migration.c diff --git a/migration/migration.c b/migration/migration.c index 7087d1abbb..44cbfb0ddd 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -16,32 +16,33 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" #include "qemu/error-report.h" -#include "qemu/main-loop.h" #include "migration/blocker.h" -#include "migration/migration.h" +#include "exec.h" +#include "fd.h" +#include "socket.h" +#include "rdma.h" +#include "ram.h" +#include "migration/global_state.h" +#include "migration/misc.h" +#include "migration.h" #include "savevm.h" #include "qemu-file-channel.h" -#include "migration/qemu-file.h" +#include "qemu-file.h" #include "migration/vmstate.h" -#include "sysemu/sysemu.h" #include "block/block.h" #include "qapi/qmp/qerror.h" -#include "qapi/util.h" -#include "qemu/sockets.h" #include "qemu/rcu.h" -#include "migration/block.h" +#include "block.h" #include "postcopy-ram.h" #include "qemu/thread.h" #include "qmp-commands.h" #include "trace.h" #include "qapi-event.h" -#include "qom/cpu.h" -#include "exec/memory.h" -#include "exec/address-spaces.h" #include "exec/target_page.h" #include "io/channel-buffer.h" -#include "io/channel-tls.h" #include "migration/colo.h" +#include "hw/boards.h" +#include "monitor/monitor.h" #define MAX_THROTTLE (32 << 20) /* Migration transfer speed throttling */ @@ -70,48 +71,78 @@ #define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10 /* Migration XBZRLE default cache size */ -#define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024) +#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024) /* The delay time (in ms) between two COLO checkpoints * Note: Please change this default value to 10000 when we support hybrid mode. */ #define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY 200 +#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2 +#define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 16 static NotifierList migration_state_notifiers = NOTIFIER_LIST_INITIALIZER(migration_state_notifiers); static bool deferred_incoming; +/* Messages sent on the return path from destination to source */ +enum mig_rp_message_type { + MIG_RP_MSG_INVALID = 0, /* Must be 0 */ + MIG_RP_MSG_SHUT, /* sibling will not send any more RP messages */ + MIG_RP_MSG_PONG, /* Response to a PING; data (seq: be32 ) */ + + MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */ + MIG_RP_MSG_REQ_PAGES, /* data (start: be64, len: be32) */ + + MIG_RP_MSG_MAX +}; + /* When we add fault tolerance, we could have several migrations at once. 
For now we don't need to add dynamic creation of migration */ -/* For outgoing */ -MigrationState *migrate_get_current(void) +static MigrationState *current_migration; + +static bool migration_object_check(MigrationState *ms, Error **errp); +static int migration_maybe_pause(MigrationState *s, + int *current_active_state, + int new_state); + +void migration_object_init(void) { - static bool once; - static MigrationState current_migration = { - .state = MIGRATION_STATUS_NONE, - .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE, - .mbps = -1, - .parameters = { - .compress_level = DEFAULT_MIGRATE_COMPRESS_LEVEL, - .compress_threads = DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT, - .decompress_threads = DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT, - .cpu_throttle_initial = DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL, - .cpu_throttle_increment = DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT, - .max_bandwidth = MAX_THROTTLE, - .downtime_limit = DEFAULT_MIGRATE_SET_DOWNTIME, - .x_checkpoint_delay = DEFAULT_MIGRATE_X_CHECKPOINT_DELAY, - }, - }; + MachineState *ms = MACHINE(qdev_get_machine()); + Error *err = NULL; - if (!once) { - current_migration.parameters.tls_creds = g_strdup(""); - current_migration.parameters.tls_hostname = g_strdup(""); - once = true; + /* This can only be called once. */ + assert(!current_migration); + current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION)); + + if (!migration_object_check(current_migration, &err)) { + error_report_err(err); + exit(1); + } + + /* + * We cannot really do this in migration_instance_init() since at + * that time global properties are not yet applied, then this + * value will be definitely replaced by something else. + */ + if (ms->enforce_config_section) { + current_migration->send_configuration = true; } - return ¤t_migration; +} + +void migration_object_finalize(void) +{ + object_unref(OBJECT(current_migration)); +} + +/* For outgoing */ +MigrationState *migrate_get_current(void) +{ + /* This can only be called after the object created. 
*/ + assert(current_migration); + return current_migration; } MigrationIncomingState *migration_incoming_get_current(void) @@ -122,7 +153,6 @@ MigrationIncomingState *migration_incoming_get_current(void) if (!once) { mis_current.state = MIGRATION_STATUS_NONE; memset(&mis_current, 0, sizeof(MigrationIncomingState)); - QLIST_INIT(&mis_current.loadvm_handlers); qemu_mutex_init(&mis_current.rp_mutex); qemu_event_init(&mis_current.main_thread_load_event, false); once = true; @@ -134,128 +164,19 @@ void migration_incoming_state_destroy(void) { struct MigrationIncomingState *mis = migration_incoming_get_current(); - qemu_event_destroy(&mis->main_thread_load_event); - loadvm_free_handlers(mis); -} - - -typedef struct { - bool optional; - uint32_t size; - uint8_t runstate[100]; - RunState state; - bool received; -} GlobalState; - -static GlobalState global_state; - -int global_state_store(void) -{ - if (!runstate_store((char *)global_state.runstate, - sizeof(global_state.runstate))) { - error_report("runstate name too big: %s", global_state.runstate); - trace_migrate_state_too_big(); - return -EINVAL; - } - return 0; -} - -void global_state_store_running(void) -{ - const char *state = RunState_lookup[RUN_STATE_RUNNING]; - strncpy((char *)global_state.runstate, - state, sizeof(global_state.runstate)); -} - -static bool global_state_received(void) -{ - return global_state.received; -} - -static RunState global_state_get_runstate(void) -{ - return global_state.state; -} - -void global_state_set_optional(void) -{ - global_state.optional = true; -} - -static bool global_state_needed(void *opaque) -{ - GlobalState *s = opaque; - char *runstate = (char *)s->runstate; - - /* If it is not optional, it is mandatory */ - - if (s->optional == false) { - return true; + if (mis->to_src_file) { + /* Tell source that we are done */ + migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0); + qemu_fclose(mis->to_src_file); + mis->to_src_file = NULL; } - /* If state is running or paused, it is not needed */ - - if (strcmp(runstate, "running") == 0 || - strcmp(runstate, "paused") == 0) { - return false; + if (mis->from_src_file) { + qemu_fclose(mis->from_src_file); + mis->from_src_file = NULL; } - /* for any other state it is needed */ - return true; -} - -static int global_state_post_load(void *opaque, int version_id) -{ - GlobalState *s = opaque; - Error *local_err = NULL; - int r; - char *runstate = (char *)s->runstate; - - s->received = true; - trace_migrate_global_state_post_load(runstate); - - r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE__MAX, - -1, &local_err); - - if (r == -1) { - if (local_err) { - error_report_err(local_err); - } - return -EINVAL; - } - s->state = r; - - return 0; -} - -static void global_state_pre_save(void *opaque) -{ - GlobalState *s = opaque; - - trace_migrate_global_state_pre_save((char *)s->runstate); - s->size = strlen((char *)s->runstate) + 1; -} - -static const VMStateDescription vmstate_globalstate = { - .name = "globalstate", - .version_id = 1, - .minimum_version_id = 1, - .post_load = global_state_post_load, - .pre_save = global_state_pre_save, - .needed = global_state_needed, - .fields = (VMStateField[]) { - VMSTATE_UINT32(size, GlobalState), - VMSTATE_BUFFER(runstate, GlobalState), - VMSTATE_END_OF_LIST() - }, -}; - -void register_global_state(void) -{ - /* We would use it independently that we receive it */ - strcpy((char *)&global_state.runstate, ""); - global_state.received = false; - vmstate_register(NULL, 0, &vmstate_globalstate, 
&global_state); + qemu_event_reset(&mis->main_thread_load_event); } static void migrate_generate_event(int new_state) @@ -278,6 +199,23 @@ static void deferred_incoming_migration(Error **errp) deferred_incoming = true; } +/* + * Send a message on the return channel back to the source + * of the migration. + */ +static void migrate_send_rp_message(MigrationIncomingState *mis, + enum mig_rp_message_type message_type, + uint16_t len, void *data) +{ + trace_migrate_send_rp_message((int)message_type, len); + qemu_mutex_lock(&mis->rp_mutex); + qemu_put_be16(mis->to_src_file, (unsigned int)message_type); + qemu_put_be16(mis->to_src_file, len); + qemu_put_buffer(mis->to_src_file, data, len); + qemu_fflush(mis->to_src_file); + qemu_mutex_unlock(&mis->rp_mutex); +} + /* Request a range of pages from the source VM at the given * start address. * rbname: Name of the RAMBlock to request the page in, if NULL it's the same @@ -351,6 +289,10 @@ static void process_incoming_migration_bh(void *opaque) */ qemu_announce_self(); + if (multifd_load_cleanup(&local_err) != 0) { + error_report_err(local_err); + autostart = false; + } /* If global state section was not received or we are in running state, we need to obey autostart. Any other state is set with runstate_set. */ @@ -365,7 +307,6 @@ static void process_incoming_migration_bh(void *opaque) } else { runstate_set(global_state_get_runstate()); } - migrate_decompress_threads_join(); /* * This must happen after any state changes since as soon as an external * observer sees this event they might start to prod at the VM assuming @@ -379,17 +320,16 @@ static void process_incoming_migration_bh(void *opaque) static void process_incoming_migration_co(void *opaque) { - QEMUFile *f = opaque; MigrationIncomingState *mis = migration_incoming_get_current(); PostcopyState ps; int ret; - mis->from_src_file = f; + assert(mis->from_src_file); mis->largest_page_size = qemu_ram_pagesize_largest(); postcopy_state_set(POSTCOPY_INCOMING_NONE); migrate_set_state(&mis->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_ACTIVE); - ret = qemu_loadvm_state(f); + ret = qemu_loadvm_state(mis->from_src_file); ps = postcopy_state_get(); trace_process_incoming_migration_co_end(ret, ps); @@ -425,44 +365,69 @@ static void process_incoming_migration_co(void *opaque) } if (ret < 0) { + Error *local_err = NULL; + migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_FAILED); error_report("load of migration failed: %s", strerror(-ret)); - migrate_decompress_threads_join(); + qemu_fclose(mis->from_src_file); + if (multifd_load_cleanup(&local_err) != 0) { + error_report_err(local_err); + } exit(EXIT_FAILURE); } - - qemu_fclose(f); - free_xbzrle_decoded_buf(); - mis->bh = qemu_bh_new(process_incoming_migration_bh, mis); qemu_bh_schedule(mis->bh); } -void migration_fd_process_incoming(QEMUFile *f) +static void migration_incoming_setup(QEMUFile *f) { - Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, f); + MigrationIncomingState *mis = migration_incoming_get_current(); - migrate_decompress_threads_create(); + if (multifd_load_setup() != 0) { + /* We haven't been able to create multifd threads + nothing better to do */ + exit(EXIT_FAILURE); + } + + if (!mis->from_src_file) { + mis->from_src_file = f; + } qemu_file_set_blocking(f, false); +} + +static void migration_incoming_process(void) +{ + Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL); qemu_coroutine_enter(co); } -/* - * Send a message on the return channel back to the source - * of 
the migration. +void migration_fd_process_incoming(QEMUFile *f) +{ + migration_incoming_setup(f); + migration_incoming_process(); +} + +void migration_ioc_process_incoming(QIOChannel *ioc) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + + if (!mis->from_src_file) { + QEMUFile *f = qemu_fopen_channel_input(ioc); + migration_fd_process_incoming(f); + } + /* We still only have a single channel. Nothing to do here yet */ +} + +/** + * @migration_has_all_channels: We have received all channels that we need + * + * Returns true when we have got connections to all the channels that + * we need for migration. */ -void migrate_send_rp_message(MigrationIncomingState *mis, - enum mig_rp_message_type message_type, - uint16_t len, void *data) +bool migration_has_all_channels(void) { - trace_migrate_send_rp_message((int)message_type, len); - qemu_mutex_lock(&mis->rp_mutex); - qemu_put_be16(mis->to_src_file, (unsigned int)message_type); - qemu_put_be16(mis->to_src_file, len); - qemu_put_buffer(mis->to_src_file, data, len); - qemu_fflush(mis->to_src_file); - qemu_mutex_unlock(&mis->rp_mutex); + return true; } /* @@ -506,9 +471,6 @@ MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp) continue; } #endif - if (i == MIGRATION_CAPABILITY_X_COLO && !colo_supported()) { - continue; - } if (head == NULL) { head = g_malloc0(sizeof(*caps)); caps = head; @@ -530,6 +492,7 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) MigrationParameters *params; MigrationState *s = migrate_get_current(); + /* TODO use QAPI_CLONE() instead of duplicating it inline */ params = g_malloc0(sizeof(*params)); params->has_compress_level = true; params->compress_level = s->parameters.compress_level; @@ -541,9 +504,9 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->cpu_throttle_initial = s->parameters.cpu_throttle_initial; params->has_cpu_throttle_increment = true; params->cpu_throttle_increment = s->parameters.cpu_throttle_increment; - params->has_tls_creds = !!s->parameters.tls_creds; + params->has_tls_creds = true; params->tls_creds = g_strdup(s->parameters.tls_creds); - params->has_tls_hostname = !!s->parameters.tls_hostname; + params->has_tls_hostname = true; params->tls_hostname = g_strdup(s->parameters.tls_hostname); params->has_max_bandwidth = true; params->max_bandwidth = s->parameters.max_bandwidth; @@ -553,6 +516,12 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->x_checkpoint_delay = s->parameters.x_checkpoint_delay; params->has_block_incremental = true; params->block_incremental = s->parameters.block_incremental; + params->has_x_multifd_channels = true; + params->x_multifd_channels = s->parameters.x_multifd_channels; + params->has_x_multifd_page_count = true; + params->x_multifd_page_count = s->parameters.x_multifd_page_count; + params->has_xbzrle_cache_size = true; + params->xbzrle_cache_size = s->parameters.xbzrle_cache_size; return params; } @@ -567,6 +536,8 @@ static bool migration_is_setup_or_active(int state) case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_SETUP: + case MIGRATION_STATUS_PRE_SWITCHOVER: + case MIGRATION_STATUS_DEVICE: return true; default: @@ -575,40 +546,53 @@ static bool migration_is_setup_or_active(int state) } } -static void get_xbzrle_cache_stats(MigrationInfo *info) -{ - if (migrate_use_xbzrle()) { - info->has_xbzrle_cache = true; - info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache)); - info->xbzrle_cache->cache_size = 
migrate_xbzrle_cache_size(); - info->xbzrle_cache->bytes = xbzrle_mig_bytes_transferred(); - info->xbzrle_cache->pages = xbzrle_mig_pages_transferred(); - info->xbzrle_cache->cache_miss = xbzrle_mig_pages_cache_miss(); - info->xbzrle_cache->cache_miss_rate = xbzrle_mig_cache_miss_rate(); - info->xbzrle_cache->overflow = xbzrle_mig_pages_overflow(); - } -} - static void populate_ram_info(MigrationInfo *info, MigrationState *s) { info->has_ram = true; info->ram = g_malloc0(sizeof(*info->ram)); - info->ram->transferred = ram_bytes_transferred(); + info->ram->transferred = ram_counters.transferred; info->ram->total = ram_bytes_total(); - info->ram->duplicate = dup_mig_pages_transferred(); + info->ram->duplicate = ram_counters.duplicate; /* legacy value. It is not used anymore */ info->ram->skipped = 0; - info->ram->normal = norm_mig_pages_transferred(); - info->ram->normal_bytes = norm_mig_pages_transferred() * + info->ram->normal = ram_counters.normal; + info->ram->normal_bytes = ram_counters.normal * qemu_target_page_size(); info->ram->mbps = s->mbps; - info->ram->dirty_sync_count = ram_dirty_sync_count(); - info->ram->postcopy_requests = ram_postcopy_requests(); + info->ram->dirty_sync_count = ram_counters.dirty_sync_count; + info->ram->postcopy_requests = ram_counters.postcopy_requests; info->ram->page_size = qemu_target_page_size(); + if (migrate_use_xbzrle()) { + info->has_xbzrle_cache = true; + info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache)); + info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size(); + info->xbzrle_cache->bytes = xbzrle_counters.bytes; + info->xbzrle_cache->pages = xbzrle_counters.pages; + info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss; + info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate; + info->xbzrle_cache->overflow = xbzrle_counters.overflow; + } + + if (cpu_throttle_active()) { + info->has_cpu_throttle_percentage = true; + info->cpu_throttle_percentage = cpu_throttle_get_percentage(); + } + if (s->state != MIGRATION_STATUS_COMPLETED) { info->ram->remaining = ram_bytes_remaining(); - info->ram->dirty_pages_rate = ram_dirty_pages_rate(); + info->ram->dirty_pages_rate = ram_counters.dirty_pages_rate; + } +} + +static void populate_disk_info(MigrationInfo *info) +{ + if (blk_mig_active()) { + info->has_disk = true; + info->disk = g_malloc0(sizeof(*info->disk)); + info->disk->transferred = blk_mig_bytes_transferred(); + info->disk->remaining = blk_mig_bytes_remaining(); + info->disk->total = blk_mig_bytes_total(); } } @@ -627,62 +611,27 @@ MigrationInfo *qmp_query_migrate(Error **errp) break; case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_CANCELLING: - info->has_status = true; - info->has_total_time = true; - info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - - s->total_time; - info->has_expected_downtime = true; - info->expected_downtime = s->expected_downtime; - info->has_setup_time = true; - info->setup_time = s->setup_time; - - populate_ram_info(info, s); - - if (blk_mig_active()) { - info->has_disk = true; - info->disk = g_malloc0(sizeof(*info->disk)); - info->disk->transferred = blk_mig_bytes_transferred(); - info->disk->remaining = blk_mig_bytes_remaining(); - info->disk->total = blk_mig_bytes_total(); - } - - if (cpu_throttle_active()) { - info->has_cpu_throttle_percentage = true; - info->cpu_throttle_percentage = cpu_throttle_get_percentage(); - } - - get_xbzrle_cache_stats(info); - break; case MIGRATION_STATUS_POSTCOPY_ACTIVE: - /* Mostly the same as active; TODO add some postcopy stats */ + case 
MIGRATION_STATUS_PRE_SWITCHOVER: + case MIGRATION_STATUS_DEVICE: + /* TODO add some postcopy stats */ info->has_status = true; info->has_total_time = true; info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - - s->total_time; + - s->start_time; info->has_expected_downtime = true; info->expected_downtime = s->expected_downtime; info->has_setup_time = true; info->setup_time = s->setup_time; populate_ram_info(info, s); - - if (blk_mig_active()) { - info->has_disk = true; - info->disk = g_malloc0(sizeof(*info->disk)); - info->disk->transferred = blk_mig_bytes_transferred(); - info->disk->remaining = blk_mig_bytes_remaining(); - info->disk->total = blk_mig_bytes_total(); - } - - get_xbzrle_cache_stats(info); + populate_disk_info(info); break; case MIGRATION_STATUS_COLO: info->has_status = true; /* TODO: display COLO specific information (checkpoint info etc.) */ break; case MIGRATION_STATUS_COMPLETED: - get_xbzrle_cache_stats(info); - info->has_status = true; info->has_total_time = true; info->total_time = s->total_time; @@ -709,150 +658,277 @@ MigrationInfo *qmp_query_migrate(Error **errp) return info; } -void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, - Error **errp) +/** + * @migration_caps_check - check capability validity + * + * @cap_list: old capability list, array of bool + * @params: new capabilities to be applied soon + * @errp: set *errp if the check failed, with reason + * + * Returns true if check passed, otherwise false. + */ +static bool migrate_caps_check(bool *cap_list, + MigrationCapabilityStatusList *params, + Error **errp) { - MigrationState *s = migrate_get_current(); MigrationCapabilityStatusList *cap; - bool old_postcopy_cap = migrate_postcopy_ram(); + bool old_postcopy_cap; + MigrationIncomingState *mis = migration_incoming_get_current(); - if (migration_is_setup_or_active(s->state)) { - error_setg(errp, QERR_MIGRATION_ACTIVE); - return; - } + old_postcopy_cap = cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]; for (cap = params; cap; cap = cap->next) { + cap_list[cap->value->capability] = cap->value->state; + } + #ifndef CONFIG_LIVE_BLOCK_MIGRATION - if (cap->value->capability == MIGRATION_CAPABILITY_BLOCK - && cap->value->state) { - error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) " - "block migration"); - error_append_hint(errp, "Use drive_mirror+NBD instead.\n"); - continue; - } -#endif - if (cap->value->capability == MIGRATION_CAPABILITY_X_COLO) { - if (!colo_supported()) { - error_setg(errp, "COLO is not currently supported, please" - " configure with --enable-colo option in order to" - " support COLO feature"); - continue; - } - } - s->enabled_capabilities[cap->value->capability] = cap->value->state; + if (cap_list[MIGRATION_CAPABILITY_BLOCK]) { + error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) " + "block migration"); + error_append_hint(errp, "Use drive_mirror+NBD instead.\n"); + return false; } +#endif - if (migrate_postcopy_ram()) { - if (migrate_use_compression()) { + if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { + if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) { /* The decompression threads asynchronously write into RAM * rather than use the atomic copies needed to avoid * userfaulting. It should be possible to fix the decompression * threads for compatibility in future. 
*/ - error_report("Postcopy is not currently compatible with " - "compression"); - s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM] = - false; + error_setg(errp, "Postcopy is not currently compatible " + "with compression"); + return false; } + /* This check is reasonably expensive, so only when it's being * set the first time, also it's only the destination that needs * special support. */ if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) && - !postcopy_ram_supported_by_host()) { + !postcopy_ram_supported_by_host(mis)) { /* postcopy_ram_supported_by_host will have emitted a more * detailed message */ - error_report("Postcopy is not supported"); - s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM] = - false; + error_setg(errp, "Postcopy is not supported"); + return false; } } + + return true; } -void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp) +void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, + Error **errp) { MigrationState *s = migrate_get_current(); + MigrationCapabilityStatusList *cap; + + if (migration_is_setup_or_active(s->state)) { + error_setg(errp, QERR_MIGRATION_ACTIVE); + return; + } + + if (!migrate_caps_check(s->enabled_capabilities, params, errp)) { + return; + } + + for (cap = params; cap; cap = cap->next) { + s->enabled_capabilities[cap->value->capability] = cap->value->state; + } +} +/* + * Check whether the parameters are valid. Error will be put into errp + * (if provided). Return true if valid, otherwise false. + */ +static bool migrate_params_check(MigrationParameters *params, Error **errp) +{ if (params->has_compress_level && - (params->compress_level < 0 || params->compress_level > 9)) { + (params->compress_level > 9)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", "is invalid, it should be in the range of 0 to 9"); - return; + return false; } - if (params->has_compress_threads && - (params->compress_threads < 1 || params->compress_threads > 255)) { + + if (params->has_compress_threads && (params->compress_threads < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_threads", "is invalid, it should be in the range of 1 to 255"); - return; + return false; } - if (params->has_decompress_threads && - (params->decompress_threads < 1 || params->decompress_threads > 255)) { + + if (params->has_decompress_threads && (params->decompress_threads < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "decompress_threads", "is invalid, it should be in the range of 1 to 255"); - return; + return false; } + if (params->has_cpu_throttle_initial && (params->cpu_throttle_initial < 1 || params->cpu_throttle_initial > 99)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu_throttle_initial", "an integer in the range of 1 to 99"); - return; + return false; } + if (params->has_cpu_throttle_increment && (params->cpu_throttle_increment < 1 || params->cpu_throttle_increment > 99)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu_throttle_increment", "an integer in the range of 1 to 99"); - return; + return false; } - if (params->has_max_bandwidth && - (params->max_bandwidth < 0 || params->max_bandwidth > SIZE_MAX)) { + + if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) { error_setg(errp, "Parameter 'max_bandwidth' expects an integer in the" " range of 0 to %zu bytes/second", SIZE_MAX); - return; + return false; } + if (params->has_downtime_limit && - (params->downtime_limit < 0 || - params->downtime_limit > MAX_MIGRATE_DOWNTIME)) { + 
(params->downtime_limit > MAX_MIGRATE_DOWNTIME)) { error_setg(errp, "Parameter 'downtime_limit' expects an integer in " "the range of 0 to %d milliseconds", MAX_MIGRATE_DOWNTIME); - return; + return false; } - if (params->has_x_checkpoint_delay && (params->x_checkpoint_delay < 0)) { + + /* x_checkpoint_delay is now always positive */ + + if (params->has_x_multifd_channels && (params->x_multifd_channels < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, - "x_checkpoint_delay", - "is invalid, it should be positive"); + "multifd_channels", + "is invalid, it should be in the range of 1 to 255"); + return false; + } + if (params->has_x_multifd_page_count && + (params->x_multifd_page_count < 1 || + params->x_multifd_page_count > 10000)) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "multifd_page_count", + "is invalid, it should be in the range of 1 to 10000"); + return false; } + if (params->has_xbzrle_cache_size && + (params->xbzrle_cache_size < qemu_target_page_size() || + !is_power_of_2(params->xbzrle_cache_size))) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "xbzrle_cache_size", + "is invalid, it should be bigger than target page size" + " and a power of two"); + return false; + } + + return true; +} + +static void migrate_params_test_apply(MigrateSetParameters *params, + MigrationParameters *dest) +{ + *dest = migrate_get_current()->parameters; + + /* TODO use QAPI_CLONE() instead of duplicating it inline */ + + if (params->has_compress_level) { + dest->compress_level = params->compress_level; + } + + if (params->has_compress_threads) { + dest->compress_threads = params->compress_threads; + } + + if (params->has_decompress_threads) { + dest->decompress_threads = params->decompress_threads; + } + + if (params->has_cpu_throttle_initial) { + dest->cpu_throttle_initial = params->cpu_throttle_initial; + } + + if (params->has_cpu_throttle_increment) { + dest->cpu_throttle_increment = params->cpu_throttle_increment; + } + + if (params->has_tls_creds) { + assert(params->tls_creds->type == QTYPE_QSTRING); + dest->tls_creds = g_strdup(params->tls_creds->u.s); + } + + if (params->has_tls_hostname) { + assert(params->tls_hostname->type == QTYPE_QSTRING); + dest->tls_hostname = g_strdup(params->tls_hostname->u.s); + } + + if (params->has_max_bandwidth) { + dest->max_bandwidth = params->max_bandwidth; + } + + if (params->has_downtime_limit) { + dest->downtime_limit = params->downtime_limit; + } + + if (params->has_x_checkpoint_delay) { + dest->x_checkpoint_delay = params->x_checkpoint_delay; + } + + if (params->has_block_incremental) { + dest->block_incremental = params->block_incremental; + } + if (params->has_x_multifd_channels) { + dest->x_multifd_channels = params->x_multifd_channels; + } + if (params->has_x_multifd_page_count) { + dest->x_multifd_page_count = params->x_multifd_page_count; + } + if (params->has_xbzrle_cache_size) { + dest->xbzrle_cache_size = params->xbzrle_cache_size; + } +} + +static void migrate_params_apply(MigrateSetParameters *params, Error **errp) +{ + MigrationState *s = migrate_get_current(); + + /* TODO use QAPI_CLONE() instead of duplicating it inline */ + if (params->has_compress_level) { s->parameters.compress_level = params->compress_level; } + if (params->has_compress_threads) { s->parameters.compress_threads = params->compress_threads; } + if (params->has_decompress_threads) { s->parameters.decompress_threads = params->decompress_threads; } + if (params->has_cpu_throttle_initial) { s->parameters.cpu_throttle_initial = params->cpu_throttle_initial; } 
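
[Editorial aside -- not part of the patch. The hunks above split parameter
setting into a check-then-apply sequence: qmp_migrate_set_parameters() first
merges the request into a scratch copy via migrate_params_test_apply(),
validates the merged result with migrate_params_check(), and only then lets
migrate_params_apply() touch the live state, so a rejected request can never
leave the parameters half-updated. Below is a minimal standalone sketch of
that pattern; the names (Params, check_params, set_params, current_params)
are hypothetical stand-ins for illustration, not QEMU APIs.

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in for MigrationParameters. */
    typedef struct {
        int compress_level;   /* valid range 0..9 */
        int downtime_limit;   /* milliseconds, capped at 2000000 */
    } Params;

    static Params current_params = { 1, 300 };

    /* Stand-in for migrate_params_check(): validate a full parameter set. */
    static bool check_params(const Params *p, const char **errp)
    {
        if (p->compress_level < 0 || p->compress_level > 9) {
            *errp = "compress_level should be in the range of 0 to 9";
            return false;
        }
        if (p->downtime_limit < 0 || p->downtime_limit > 2000000) {
            *errp = "downtime_limit out of range";
            return false;
        }
        return true;
    }

    /* Check-then-apply: merge the request into a scratch copy, validate
     * the merged result, and commit only on success, so a rejected
     * request never leaves the live parameters half-updated. */
    static bool set_params(const Params *req, const char **errp)
    {
        Params tmp = current_params;     /* cf. migrate_params_test_apply() */

        tmp.compress_level = req->compress_level;
        tmp.downtime_limit = req->downtime_limit;

        if (!check_params(&tmp, errp)) { /* cf. migrate_params_check() */
            return false;
        }
        current_params = tmp;            /* cf. migrate_params_apply() */
        return true;
    }

    int main(void)
    {
        const char *err = NULL;
        Params bad = { .compress_level = 12, .downtime_limit = 300 };

        if (!set_params(&bad, &err)) {
            printf("rejected: %s\n", err);
        }
        printf("live compress_level is still %d\n",
               current_params.compress_level);
        return 0;
    }

End of aside; the diff resumes below.]
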
+ if (params->has_cpu_throttle_increment) { s->parameters.cpu_throttle_increment = params->cpu_throttle_increment; } + if (params->has_tls_creds) { g_free(s->parameters.tls_creds); - s->parameters.tls_creds = g_strdup(params->tls_creds); + assert(params->tls_creds->type == QTYPE_QSTRING); + s->parameters.tls_creds = g_strdup(params->tls_creds->u.s); } + if (params->has_tls_hostname) { g_free(s->parameters.tls_hostname); - s->parameters.tls_hostname = g_strdup(params->tls_hostname); + assert(params->tls_hostname->type == QTYPE_QSTRING); + s->parameters.tls_hostname = g_strdup(params->tls_hostname->u.s); } + if (params->has_max_bandwidth) { s->parameters.max_bandwidth = params->max_bandwidth; if (s->to_dst_file) { @@ -860,6 +936,7 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp) s->parameters.max_bandwidth / XFER_LIMIT_RATIO); } } + if (params->has_downtime_limit) { s->parameters.downtime_limit = params->downtime_limit; } @@ -870,9 +947,49 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp) colo_checkpoint_notify(s); } } + if (params->has_block_incremental) { s->parameters.block_incremental = params->block_incremental; } + if (params->has_x_multifd_channels) { + s->parameters.x_multifd_channels = params->x_multifd_channels; + } + if (params->has_x_multifd_page_count) { + s->parameters.x_multifd_page_count = params->x_multifd_page_count; + } + if (params->has_xbzrle_cache_size) { + s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; + xbzrle_cache_resize(params->xbzrle_cache_size, errp); + } +} + +void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) +{ + MigrationParameters tmp; + + /* TODO Rewrite "" to null instead */ + if (params->has_tls_creds + && params->tls_creds->type == QTYPE_QNULL) { + QDECREF(params->tls_creds->u.n); + params->tls_creds->type = QTYPE_QSTRING; + params->tls_creds->u.s = strdup(""); + } + /* TODO Rewrite "" to null instead */ + if (params->has_tls_hostname + && params->tls_hostname->type == QTYPE_QNULL) { + QDECREF(params->tls_hostname->u.n); + params->tls_hostname->type = QTYPE_QSTRING; + params->tls_hostname->u.s = strdup(""); + } + + migrate_params_test_apply(params, &tmp); + + if (!migrate_params_check(&tmp, errp)) { + /* Invalid parameter */ + return; + } + + migrate_params_apply(params, errp); } @@ -902,20 +1019,34 @@ void qmp_migrate_start_postcopy(Error **errp) void migrate_set_state(int *state, int old_state, int new_state) { + assert(new_state < MIGRATION_STATUS__MAX); if (atomic_cmpxchg(state, old_state, new_state) == old_state) { - trace_migrate_set_state(new_state); + trace_migrate_set_state(MigrationStatus_str(new_state)); migrate_generate_event(new_state); } } -void migrate_set_block_enabled(bool value, Error **errp) +static MigrationCapabilityStatusList *migrate_cap_add( + MigrationCapabilityStatusList *list, + MigrationCapability index, + bool state) { MigrationCapabilityStatusList *cap; cap = g_new0(MigrationCapabilityStatusList, 1); cap->value = g_new0(MigrationCapabilityStatus, 1); - cap->value->capability = MIGRATION_CAPABILITY_BLOCK; - cap->value->state = value; + cap->value->capability = index; + cap->value->state = state; + cap->next = list; + + return cap; +} + +void migrate_set_block_enabled(bool value, Error **errp) +{ + MigrationCapabilityStatusList *cap; + + cap = migrate_cap_add(NULL, MIGRATION_CAPABILITY_BLOCK, value); qmp_migrate_set_capabilities(cap, errp); qapi_free_MigrationCapabilityStatusList(cap); } @@ -942,9 +1073,11 @@ static void 
migrate_fd_cleanup(void *opaque) qemu_bh_delete(s->cleanup_bh); s->cleanup_bh = NULL; - migration_page_queue_free(); + qemu_savevm_state_cleanup(); if (s->to_dst_file) { + Error *local_err = NULL; + trace_migrate_fd_cleanup(); qemu_mutex_unlock_iothread(); if (s->migration_thread_running) { @@ -953,7 +1086,9 @@ static void migrate_fd_cleanup(void *opaque) } qemu_mutex_lock_iothread(); - migrate_compress_threads_join(); + if (multifd_save_cleanup(&local_err) != 0) { + error_report_err(local_err); + } qemu_fclose(s->to_dst_file); s->to_dst_file = NULL; } @@ -966,21 +1101,30 @@ static void migrate_fd_cleanup(void *opaque) MIGRATION_STATUS_CANCELLED); } + if (s->error) { + /* It is used on info migrate. We can't free it */ + error_report_err(error_copy(s->error)); + } notifier_list_notify(&migration_state_notifiers, s); block_cleanup_parameters(s); } +void migrate_set_error(MigrationState *s, const Error *error) +{ + qemu_mutex_lock(&s->error_mutex); + if (!s->error) { + s->error = error_copy(error); + } + qemu_mutex_unlock(&s->error_mutex); +} + void migrate_fd_error(MigrationState *s, const Error *error) { trace_migrate_fd_error(error_get_pretty(error)); assert(s->to_dst_file == NULL); migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); - if (!s->error) { - s->error = error_copy(error); - } - notifier_list_notify(&migration_state_notifiers, s); - block_cleanup_parameters(s); + migrate_set_error(s, error); } static void migrate_fd_cancel(MigrationState *s) @@ -999,6 +1143,10 @@ static void migrate_fd_cancel(MigrationState *s) if (!migration_is_setup_or_active(old_state)) { break; } + /* If the migration is paused, kick it out of the pause */ + if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) { + qemu_sem_post(&s->pause_sem); + } migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING); } while (s->state != MIGRATION_STATUS_CANCELLING); @@ -1022,7 +1170,6 @@ static void migrate_fd_cancel(MigrationState *s) s->block_inactive = false; } } - block_cleanup_parameters(s); } void add_migration_state_change_notifier(Notifier *notify) @@ -1078,6 +1225,8 @@ bool migration_is_idle(void) case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_COLO: + case MIGRATION_STATUS_PRE_SWITCHOVER: + case MIGRATION_STATUS_DEVICE: return false; case MIGRATION_STATUS__MAX: g_assert_not_reached(); @@ -1114,7 +1263,11 @@ MigrationState *migrate_init(void) migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); - s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + s->total_time = 0; + s->vm_was_running = false; + s->iteration_initial_bytes = 0; + s->threshold_size = 0; return s; } @@ -1122,7 +1275,7 @@ static GSList *migration_blockers; int migrate_add_blocker(Error *reason, Error **errp) { - if (only_migratable) { + if (migrate_get_current()->only_migratable) { error_propagate(errp, error_copy(reason)); error_prepend(errp, "disallowing migration blocker " "(--only_migratable) for: "); @@ -1175,7 +1328,7 @@ bool migration_is_blocked(Error **errp) } if (migration_blockers) { - *errp = error_copy(migration_blockers->data); + error_propagate(errp, error_copy(migration_blockers->data)); return true; } @@ -1257,33 +1410,25 @@ void qmp_migrate_cancel(Error **errp) migrate_fd_cancel(migrate_get_current()); } -void qmp_migrate_set_cache_size(int64_t value, Error **errp) +void qmp_migrate_continue(MigrationStatus state, Error **errp) { MigrationState *s = 
migrate_get_current(); - int64_t new_size; - - /* Check for truncation */ - if (value != (size_t)value) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "exceeding address space"); - return; - } - - /* Cache should not be larger than guest ram size */ - if (value > ram_bytes_total()) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "exceeds guest ram size "); + if (s->state != state) { + error_setg(errp, "Migration not in expected state: %s", + MigrationStatus_str(s->state)); return; } + qemu_sem_post(&s->pause_sem); +} - new_size = xbzrle_cache_resize(value); - if (new_size < 0) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "is smaller than page size"); - return; - } +void qmp_migrate_set_cache_size(int64_t value, Error **errp) +{ + MigrateSetParameters p = { + .has_xbzrle_cache_size = true, + .xbzrle_cache_size = value, + }; - s->xbzrle_cache_size = new_size; + qmp_migrate_set_parameters(&p, errp); } int64_t qmp_query_migrate_cache_size(Error **errp) @@ -1293,7 +1438,7 @@ int64_t qmp_query_migrate_cache_size(Error **errp) void qmp_migrate_set_speed(int64_t value, Error **errp) { - MigrationParameters p = { + MigrateSetParameters p = { .has_max_bandwidth = true, .max_bandwidth = value, }; @@ -1313,7 +1458,7 @@ void qmp_migrate_set_downtime(double value, Error **errp) value *= 1000; /* Convert to milliseconds */ value = MAX(0, MIN(INT64_MAX, value)); - MigrationParameters p = { + MigrateSetParameters p = { .has_downtime_limit = true, .downtime_limit = value, }; @@ -1339,6 +1484,11 @@ bool migrate_postcopy_ram(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM]; } +bool migrate_postcopy(void) +{ + return migrate_postcopy_ram(); +} + bool migrate_auto_converge(void) { MigrationState *s; @@ -1402,6 +1552,43 @@ bool migrate_use_events(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS]; } +bool migrate_use_multifd(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_X_MULTIFD]; +} + +bool migrate_pause_before_switchover(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[ + MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER]; +} + +int migrate_multifd_channels(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.x_multifd_channels; +} + +int migrate_multifd_page_count(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.x_multifd_page_count; +} + int migrate_use_xbzrle(void) { MigrationState *s; @@ -1417,7 +1604,7 @@ int64_t migrate_xbzrle_cache_size(void) s = migrate_get_current(); - return s->xbzrle_cache_size; + return s->parameters.xbzrle_cache_size; } bool migrate_use_block(void) @@ -1429,6 +1616,15 @@ bool migrate_use_block(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_BLOCK]; } +bool migrate_use_return_path(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_RETURN_PATH]; +} + bool migrate_use_block_incremental(void) { MigrationState *s; @@ -1646,28 +1842,36 @@ static int await_return_path_close_on_source(MigrationState *ms) * Switch from normal iteration to postcopy * Returns non-0 on error */ -static int postcopy_start(MigrationState *ms, bool *old_vm_running) +static int postcopy_start(MigrationState *ms) { int ret; QIOChannelBuffer *bioc; QEMUFile *fb; int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); bool restart_block = false; - 
migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_POSTCOPY_ACTIVE); + int cur_state = MIGRATION_STATUS_ACTIVE; + if (!migrate_pause_before_switchover()) { + migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + } trace_postcopy_start(); qemu_mutex_lock_iothread(); trace_postcopy_start_set_run(); qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); - *old_vm_running = runstate_is_running(); global_state_store(); ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); if (ret < 0) { goto fail; } + ret = migration_maybe_pause(ms, &cur_state, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + if (ret < 0) { + goto fail; + } + ret = bdrv_inactivate_all(); if (ret < 0) { goto fail; @@ -1678,7 +1882,7 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) * Cause any non-postcopiable, but iterative devices to * send out their final data. */ - qemu_savevm_state_complete_precopy(ms->to_dst_file, true); + qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false); /* * in Finish migrate and with the io-lock held everything should @@ -1686,9 +1890,11 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) * need to tell the destination to throw any pages it's already received * that are dirty */ - if (ram_postcopy_send_discard_bitmap(ms)) { - error_report("postcopy send discard bitmap failed"); - goto fail; + if (migrate_postcopy_ram()) { + if (ram_postcopy_send_discard_bitmap(ms)) { + error_report("postcopy send discard bitmap failed"); + goto fail; + } } /* @@ -1697,8 +1903,10 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) * wrap their state up here */ qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX); - /* Ping just for debugging, helps line traces up */ - qemu_savevm_send_ping(ms->to_dst_file, 2); + if (migrate_postcopy_ram()) { + /* Ping just for debugging, helps line traces up */ + qemu_savevm_send_ping(ms->to_dst_file, 2); + } /* * While loading the device state we may trigger page transfer @@ -1722,8 +1930,10 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) */ qemu_savevm_send_postcopy_listen(fb); - qemu_savevm_state_complete_precopy(fb, false); - qemu_savevm_send_ping(fb, 3); + qemu_savevm_state_complete_precopy(fb, false, false); + if (migrate_postcopy_ram()) { + qemu_savevm_send_ping(fb, 3); + } qemu_savevm_send_postcopy_run(fb); @@ -1758,11 +1968,13 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) qemu_mutex_unlock_iothread(); - /* - * Although this ping is just for debug, it could potentially be - * used for getting a better measurement of downtime at the source. - */ - qemu_savevm_send_ping(ms->to_dst_file, 4); + if (migrate_postcopy_ram()) { + /* + * Although this ping is just for debug, it could potentially be + * used for getting a better measurement of downtime at the source. 
+ */ + qemu_savevm_send_ping(ms->to_dst_file, 4); + } if (migrate_release_ram()) { ram_postcopy_migrated_memory_release(ms); @@ -1797,40 +2009,72 @@ fail: return -1; } +/** + * migration_maybe_pause: Pause if required to by + * migrate_pause_before_switchover called with the iothread locked + * Returns: 0 on success + */ +static int migration_maybe_pause(MigrationState *s, + int *current_active_state, + int new_state) +{ + if (!migrate_pause_before_switchover()) { + return 0; + } + + /* Since leaving this state is not atomic with posting the semaphore + * it's possible that someone could have issued multiple migrate_continue + * and the semaphore is incorrectly positive at this point; + * the docs say it's undefined to reinit a semaphore that's already + * init'd, so use timedwait to eat up any existing posts. + */ + while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) { + /* This block intentionally left blank */ + } + + qemu_mutex_unlock_iothread(); + migrate_set_state(&s->state, *current_active_state, + MIGRATION_STATUS_PRE_SWITCHOVER); + qemu_sem_wait(&s->pause_sem); + migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER, + new_state); + *current_active_state = new_state; + qemu_mutex_lock_iothread(); + + return s->state == new_state ? 0 : -EINVAL; +} + /** * migration_completion: Used by migration_thread when there's not much left. * The caller 'breaks' the loop when this returns. * * @s: Current migration state - * @current_active_state: The migration state we expect to be in - * @*old_vm_running: Pointer to old_vm_running flag - * @*start_time: Pointer to time to update */ -static void migration_completion(MigrationState *s, int current_active_state, - bool *old_vm_running, - int64_t *start_time) +static void migration_completion(MigrationState *s) { int ret; + int current_active_state = s->state; if (s->state == MIGRATION_STATUS_ACTIVE) { qemu_mutex_lock_iothread(); - *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); - *old_vm_running = runstate_is_running(); + s->vm_was_running = runstate_is_running(); ret = global_state_store(); if (!ret) { + bool inactivate = !migrate_colo_enabled(); ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); - /* - * Don't mark the image with BDRV_O_INACTIVE flag if - * we will go into COLO stage later. - */ - if (ret >= 0 && !migrate_colo_enabled()) { - ret = bdrv_inactivate_all(); + if (ret >= 0) { + ret = migration_maybe_pause(s, ¤t_active_state, + MIGRATION_STATUS_DEVICE); } if (ret >= 0) { qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); - qemu_savevm_state_complete_precopy(s->to_dst_file, false); + ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, + inactivate); + } + if (inactivate && ret >= 0) { s->block_inactive = true; } } @@ -1851,13 +2095,12 @@ static void migration_completion(MigrationState *s, int current_active_state, * cleaning everything else up (since if there are no failures * it will wait for the destination to send it's status in * a SHUT command). 
- * Postcopy opens rp if enabled (even if it's not avtivated) */ - if (migrate_postcopy_ram()) { + if (s->rp_state.from_dst_file) { int rp_error; - trace_migration_completion_postcopy_end_before_rp(); + trace_migration_return_path_end_before(); rp_error = await_return_path_close_on_source(s); - trace_migration_completion_postcopy_end_after_rp(rp_error); + trace_migration_return_path_end_after(rp_error); if (rp_error) { goto fail_invalidate; } @@ -1903,6 +2146,154 @@ bool migrate_colo_enabled(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO]; } +static void migration_calculate_complete(MigrationState *s) +{ + uint64_t bytes = qemu_ftell(s->to_dst_file); + int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + s->total_time = end_time - s->start_time; + if (!s->downtime) { + /* + * It's still not set, so we are precopy migration. For + * postcopy, downtime is calculated during postcopy_start(). + */ + s->downtime = end_time - s->downtime_start; + } + + if (s->total_time) { + s->mbps = ((double) bytes * 8.0) / s->total_time / 1000; + } +} + +static void migration_update_counters(MigrationState *s, + int64_t current_time) +{ + uint64_t transferred, time_spent; + double bandwidth; + + if (current_time < s->iteration_start_time + BUFFER_DELAY) { + return; + } + + transferred = qemu_ftell(s->to_dst_file) - s->iteration_initial_bytes; + time_spent = current_time - s->iteration_start_time; + bandwidth = (double)transferred / time_spent; + s->threshold_size = bandwidth * s->parameters.downtime_limit; + + s->mbps = (((double) transferred * 8.0) / + ((double) time_spent / 1000.0)) / 1000.0 / 1000.0; + + /* + * if we haven't sent anything, we don't want to + * recalculate. 10000 is a small enough number for our purposes + */ + if (ram_counters.dirty_pages_rate && transferred > 10000) { + s->expected_downtime = ram_counters.dirty_pages_rate * + qemu_target_page_size() / bandwidth; + } + + qemu_file_reset_rate_limit(s->to_dst_file); + + s->iteration_start_time = current_time; + s->iteration_initial_bytes = qemu_ftell(s->to_dst_file); + + trace_migrate_transferred(transferred, time_spent, + bandwidth, s->threshold_size); +} + +/* Migration thread iteration status */ +typedef enum { + MIG_ITERATE_RESUME, /* Resume current iteration */ + MIG_ITERATE_SKIP, /* Skip current iteration */ + MIG_ITERATE_BREAK, /* Break the loop */ +} MigIterateState; + +/* + * Return true if continue to the next iteration directly, false + * otherwise. 
+ */ +static MigIterateState migration_iteration_run(MigrationState *s) +{ + uint64_t pending_size, pend_post, pend_nonpost; + bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE; + + qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, + &pend_nonpost, &pend_post); + pending_size = pend_nonpost + pend_post; + + trace_migrate_pending(pending_size, s->threshold_size, + pend_post, pend_nonpost); + + if (pending_size && pending_size >= s->threshold_size) { + /* Still a significant amount to transfer */ + if (migrate_postcopy() && !in_postcopy && + pend_nonpost <= s->threshold_size && + atomic_read(&s->start_postcopy)) { + if (postcopy_start(s)) { + error_report("%s: postcopy failed to start", __func__); + } + return MIG_ITERATE_SKIP; + } + /* Just another iteration step */ + qemu_savevm_state_iterate(s->to_dst_file, + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); + } else { + trace_migration_thread_low_pending(pending_size); + migration_completion(s); + return MIG_ITERATE_BREAK; + } + + return MIG_ITERATE_RESUME; +} + +static void migration_iteration_finish(MigrationState *s) +{ + /* If we enabled cpu throttling for auto-converge, turn it off. */ + cpu_throttle_stop(); + + qemu_mutex_lock_iothread(); + switch (s->state) { + case MIGRATION_STATUS_COMPLETED: + migration_calculate_complete(s); + runstate_set(RUN_STATE_POSTMIGRATE); + break; + + case MIGRATION_STATUS_ACTIVE: + /* + * We should really assert here, but since it's during + * migration, let's try to reduce the usage of assertions. + */ + if (!migrate_colo_enabled()) { + error_report("%s: critical error: calling COLO code without " + "COLO enabled", __func__); + } + migrate_start_colo_process(s); + /* + * Fixme: we will run VM in COLO no matter its old running state. + * After exited COLO, we will keep running. + */ + s->vm_was_running = true; + /* Fallthrough */ + case MIGRATION_STATUS_FAILED: + case MIGRATION_STATUS_CANCELLED: + if (s->vm_was_running) { + vm_start(); + } else { + if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { + runstate_set(RUN_STATE_POSTMIGRATE); + } + } + break; + + default: + /* Should not reach here, but if so, forgive the VM. */ + error_report("%s: Unknown ending state %d", __func__, s->state); + break; + } + qemu_bh_schedule(s->cleanup_bh); + qemu_mutex_unlock_iothread(); +} + /* * Master migration thread on the source VM. * It drives the migration and pumps the data down the outgoing channel. @@ -1910,35 +2301,27 @@ bool migrate_colo_enabled(void) static void *migration_thread(void *opaque) { MigrationState *s = opaque; - /* Used by the bandwidth calcs, updated later */ - int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); - int64_t initial_bytes = 0; - /* - * The final stage happens when the remaining data is smaller than - * this threshold; it's calculated from the requested downtime and - * measured bandwidth - */ - int64_t threshold_size = 0; - int64_t start_time = initial_time; - int64_t end_time; - bool old_vm_running = false; - bool entered_postcopy = false; - /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ - enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; - bool enable_colo = migrate_colo_enabled(); rcu_register_thread(); + s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + qemu_savevm_state_header(s->to_dst_file); - if (migrate_postcopy_ram()) { + /* + * If we opened the return path, we need to make sure dst has it + * opened as well. 
+ */ + if (s->rp_state.from_dst_file) { /* Now tell the dest that it should open its end so it can reply */ qemu_savevm_send_open_return_path(s->to_dst_file); /* And do a ping that will make stuff easier to debug */ qemu_savevm_send_ping(s->to_dst_file, 1); + } + if (migrate_postcopy()) { /* * Tell the destination that we *might* want to do postcopy later; * if the other end can't do postcopy it should fail now, nice and @@ -1947,7 +2330,7 @@ static void *migration_thread(void *opaque) qemu_savevm_send_postcopy_advise(s->to_dst_file); } - qemu_savevm_state_begin(s->to_dst_file); + qemu_savevm_state_setup(s->to_dst_file); s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, @@ -1958,130 +2341,51 @@ static void *migration_thread(void *opaque) while (s->state == MIGRATION_STATUS_ACTIVE || s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { int64_t current_time; - uint64_t pending_size; if (!qemu_file_rate_limit(s->to_dst_file)) { - uint64_t pend_post, pend_nonpost; - - qemu_savevm_state_pending(s->to_dst_file, threshold_size, - &pend_nonpost, &pend_post); - pending_size = pend_nonpost + pend_post; - trace_migrate_pending(pending_size, threshold_size, - pend_post, pend_nonpost); - if (pending_size && pending_size >= threshold_size) { - /* Still a significant amount to transfer */ - - if (migrate_postcopy_ram() && - s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE && - pend_nonpost <= threshold_size && - atomic_read(&s->start_postcopy)) { - - if (!postcopy_start(s, &old_vm_running)) { - current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE; - entered_postcopy = true; - } - - continue; - } - /* Just another iteration step */ - qemu_savevm_state_iterate(s->to_dst_file, entered_postcopy); - } else { - trace_migration_thread_low_pending(pending_size); - migration_completion(s, current_active_state, - &old_vm_running, &start_time); + MigIterateState iter_state = migration_iteration_run(s); + if (iter_state == MIG_ITERATE_SKIP) { + continue; + } else if (iter_state == MIG_ITERATE_BREAK) { break; } } if (qemu_file_get_error(s->to_dst_file)) { - migrate_set_state(&s->state, current_active_state, - MIGRATION_STATUS_FAILED); + if (migration_is_setup_or_active(s->state)) { + migrate_set_state(&s->state, s->state, + MIGRATION_STATUS_FAILED); + } trace_migration_thread_file_err(); break; } + current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - if (current_time >= initial_time + BUFFER_DELAY) { - uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) - - initial_bytes; - uint64_t time_spent = current_time - initial_time; - double bandwidth = (double)transferred_bytes / time_spent; - threshold_size = bandwidth * s->parameters.downtime_limit; - - s->mbps = (((double) transferred_bytes * 8.0) / - ((double) time_spent / 1000.0)) / 1000.0 / 1000.0; - - trace_migrate_transferred(transferred_bytes, time_spent, - bandwidth, threshold_size); - /* if we haven't sent anything, we don't want to recalculate - 10000 is a small enough number for our purposes */ - if (ram_dirty_pages_rate() && transferred_bytes > 10000) { - s->expected_downtime = ram_dirty_pages_rate() * - qemu_target_page_size() / bandwidth; - } - qemu_file_reset_rate_limit(s->to_dst_file); - initial_time = current_time; - initial_bytes = qemu_ftell(s->to_dst_file); - } + migration_update_counters(s, current_time); + if (qemu_file_rate_limit(s->to_dst_file)) { /* usleep expects microseconds */ - g_usleep((initial_time + BUFFER_DELAY - current_time)*1000); + g_usleep((s->iteration_start_time + 
BUFFER_DELAY - + current_time) * 1000); } } trace_migration_thread_after_loop(); - /* If we enabled cpu throttling for auto-converge, turn it off. */ - cpu_throttle_stop(); - end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - - qemu_mutex_lock_iothread(); - /* - * The resource has been allocated by migration will be reused in COLO - * process, so don't release them. - */ - if (!enable_colo) { - qemu_savevm_state_cleanup(); - } - if (s->state == MIGRATION_STATUS_COMPLETED) { - uint64_t transferred_bytes = qemu_ftell(s->to_dst_file); - s->total_time = end_time - s->total_time; - if (!entered_postcopy) { - s->downtime = end_time - start_time; - } - if (s->total_time) { - s->mbps = (((double) transferred_bytes * 8.0) / - ((double) s->total_time)) / 1000; - } - runstate_set(RUN_STATE_POSTMIGRATE); - } else { - if (s->state == MIGRATION_STATUS_ACTIVE && enable_colo) { - migrate_start_colo_process(s); - qemu_savevm_state_cleanup(); - /* - * Fixme: we will run VM in COLO no matter its old running state. - * After exited COLO, we will keep running. - */ - old_vm_running = true; - } - if (old_vm_running && !entered_postcopy) { - vm_start(); - } else { - if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { - runstate_set(RUN_STATE_POSTMIGRATE); - } - } - } - qemu_bh_schedule(s->cleanup_bh); - qemu_mutex_unlock_iothread(); - + migration_iteration_finish(s); rcu_unregister_thread(); return NULL; } -void migrate_fd_connect(MigrationState *s) +void migrate_fd_connect(MigrationState *s, Error *error_in) { s->expected_downtime = s->parameters.downtime_limit; s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s); + if (error_in) { + migrate_fd_error(s, error_in); + migrate_fd_cleanup(s); + return; + } qemu_file_set_blocking(s->to_dst_file, true); qemu_file_set_rate_limit(s->to_dst_file, @@ -2091,10 +2395,11 @@ void migrate_fd_connect(MigrationState *s) notifier_list_notify(&migration_state_notifiers, s); /* - * Open the return path; currently for postcopy but other things might - * also want it. + * Open the return path. For postcopy, it is used exclusively. For + * precopy, only if user specified "return-path" capability would + * QEMU uses the return path. */ - if (migrate_postcopy_ram()) { + if (migrate_postcopy_ram() || migrate_use_return_path()) { if (open_return_path_on_source(s)) { error_report("Unable to open return-path for postcopy"); migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, @@ -2104,9 +2409,192 @@ void migrate_fd_connect(MigrationState *s) } } - migrate_compress_threads_create(); + if (multifd_save_setup() != 0) { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + migrate_fd_cleanup(s); + return; + } qemu_thread_create(&s->thread, "live_migration", migration_thread, s, QEMU_THREAD_JOINABLE); s->migration_thread_running = true; } +void migration_global_dump(Monitor *mon) +{ + MigrationState *ms = migrate_get_current(); + + monitor_printf(mon, "globals:\n"); + monitor_printf(mon, "store-global-state: %s\n", + ms->store_global_state ? "on" : "off"); + monitor_printf(mon, "only-migratable: %s\n", + ms->only_migratable ? "on" : "off"); + monitor_printf(mon, "send-configuration: %s\n", + ms->send_configuration ? "on" : "off"); + monitor_printf(mon, "send-section-footer: %s\n", + ms->send_section_footer ? 
"on" : "off"); +} + +#define DEFINE_PROP_MIG_CAP(name, x) \ + DEFINE_PROP_BOOL(name, MigrationState, enabled_capabilities[x], false) + +static Property migration_properties[] = { + DEFINE_PROP_BOOL("store-global-state", MigrationState, + store_global_state, true), + DEFINE_PROP_BOOL("only-migratable", MigrationState, only_migratable, false), + DEFINE_PROP_BOOL("send-configuration", MigrationState, + send_configuration, true), + DEFINE_PROP_BOOL("send-section-footer", MigrationState, + send_section_footer, true), + + /* Migration parameters */ + DEFINE_PROP_UINT8("x-compress-level", MigrationState, + parameters.compress_level, + DEFAULT_MIGRATE_COMPRESS_LEVEL), + DEFINE_PROP_UINT8("x-compress-threads", MigrationState, + parameters.compress_threads, + DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT), + DEFINE_PROP_UINT8("x-decompress-threads", MigrationState, + parameters.decompress_threads, + DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT), + DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState, + parameters.cpu_throttle_initial, + DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL), + DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState, + parameters.cpu_throttle_increment, + DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT), + DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState, + parameters.max_bandwidth, MAX_THROTTLE), + DEFINE_PROP_UINT64("x-downtime-limit", MigrationState, + parameters.downtime_limit, + DEFAULT_MIGRATE_SET_DOWNTIME), + DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState, + parameters.x_checkpoint_delay, + DEFAULT_MIGRATE_X_CHECKPOINT_DELAY), + DEFINE_PROP_UINT8("x-multifd-channels", MigrationState, + parameters.x_multifd_channels, + DEFAULT_MIGRATE_MULTIFD_CHANNELS), + DEFINE_PROP_UINT32("x-multifd-page-count", MigrationState, + parameters.x_multifd_page_count, + DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT), + DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState, + parameters.xbzrle_cache_size, + DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE), + + /* Migration capabilities */ + DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), + DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL), + DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE), + DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS), + DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS), + DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS), + DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM), + DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO), + DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM), + DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK), + DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH), + DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_X_MULTIFD), + + DEFINE_PROP_END_OF_LIST(), +}; + +static void migration_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->user_creatable = false; + dc->props = migration_properties; +} + +static void migration_instance_finalize(Object *obj) +{ + MigrationState *ms = MIGRATION_OBJ(obj); + MigrationParameters *params = &ms->parameters; + + qemu_mutex_destroy(&ms->error_mutex); + g_free(params->tls_hostname); + g_free(params->tls_creds); + qemu_sem_destroy(&ms->pause_sem); +} + +static void migration_instance_init(Object *obj) +{ + MigrationState *ms = MIGRATION_OBJ(obj); + MigrationParameters *params = &ms->parameters; + + ms->state = MIGRATION_STATUS_NONE; + ms->mbps = -1; + 
+
+static void migration_instance_finalize(Object *obj)
+{
+    MigrationState *ms = MIGRATION_OBJ(obj);
+    MigrationParameters *params = &ms->parameters;
+
+    qemu_mutex_destroy(&ms->error_mutex);
+    g_free(params->tls_hostname);
+    g_free(params->tls_creds);
+    qemu_sem_destroy(&ms->pause_sem);
+}
+
+static void migration_instance_init(Object *obj)
+{
+    MigrationState *ms = MIGRATION_OBJ(obj);
+    MigrationParameters *params = &ms->parameters;
+
+    ms->state = MIGRATION_STATUS_NONE;
+    ms->mbps = -1;
+    qemu_sem_init(&ms->pause_sem, 0);
+    qemu_mutex_init(&ms->error_mutex);
+
+    params->tls_hostname = g_strdup("");
+    params->tls_creds = g_strdup("");
+
+    /* Set has_* up only for parameter checks */
+    params->has_compress_level = true;
+    params->has_compress_threads = true;
+    params->has_decompress_threads = true;
+    params->has_cpu_throttle_initial = true;
+    params->has_cpu_throttle_increment = true;
+    params->has_max_bandwidth = true;
+    params->has_downtime_limit = true;
+    params->has_x_checkpoint_delay = true;
+    params->has_block_incremental = true;
+    params->has_x_multifd_channels = true;
+    params->has_x_multifd_page_count = true;
+    params->has_xbzrle_cache_size = true;
+}
+
+/*
+ * Return true if the check passes, false otherwise.  On failure an
+ * error is set in errp when provided.
+ */
+static bool migration_object_check(MigrationState *ms, Error **errp)
+{
+    MigrationCapabilityStatusList *head = NULL;
+    /* Assume all capabilities are off to begin with */
+    bool cap_list[MIGRATION_CAPABILITY__MAX] = { 0 }, ret;
+    int i;
+
+    if (!migrate_params_check(&ms->parameters, errp)) {
+        return false;
+    }
+
+    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
+        if (ms->enabled_capabilities[i]) {
+            head = migrate_cap_add(head, i, true);
+        }
+    }
+
+    ret = migrate_caps_check(cap_list, head, errp);
+
+    /* It works with head == NULL */
+    qapi_free_MigrationCapabilityStatusList(head);
+
+    return ret;
+}
+
+static const TypeInfo migration_type = {
+    .name = TYPE_MIGRATION,
+    /*
+     * NOTE: TYPE_MIGRATION is not really a device, as the object is
+     * not created using qdev_create(), it is not attached to the qdev
+     * device tree, and it is never realized.
+     *
+     * TODO: Make this TYPE_OBJECT once QOM provides something like
+     * TYPE_DEVICE's "-global" properties.
+     */
+    .parent = TYPE_DEVICE,
+    .class_init = migration_class_init,
+    .class_size = sizeof(MigrationClass),
+    .instance_size = sizeof(MigrationState),
+    .instance_init = migration_instance_init,
+    .instance_finalize = migration_instance_finalize,
+};
+
+static void register_migration_types(void)
+{
+    type_register_static(&migration_type);
+}
+
+type_init(register_migration_types);
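type_init() is what ties the new QOM type together: it arranges for register_migration_types() to run during early module initialization, so merely linking this object file makes TYPE_MIGRATION known to the type system. A rough, self-contained sketch of that registration idiom, where the constructor attribute and the fake list stand in for QEMU's module_init() machinery:

#include <stdio.h>

typedef struct TypeImpl TypeImpl;
struct TypeImpl {
    const char *name;
    TypeImpl *next;
};

static TypeImpl *type_list;

static void fake_type_register_static(TypeImpl *ti)
{
    /* prepend to a global registry, as the real registration does */
    ti->next = type_list;
    type_list = ti;
}

static TypeImpl fake_migration_type = {
    .name = "migration",
};

/* roughly what type_init(register_migration_types) arranges to run */
__attribute__((constructor))
static void register_migration_types_sketch(void)
{
    fake_type_register_static(&fake_migration_type);
}

int main(void)
{
    for (TypeImpl *ti = type_list; ti; ti = ti->next) {
        printf("registered type: %s\n", ti->name);
    }
    return 0;
}

Note that migration_class_init() clears user_creatable, so despite the TYPE_DEVICE parentage the object cannot be instantiated with -device; it exists only as the singleton the migration core creates at startup.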