2 * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
3 * (a.k.a. Fault Tolerance or Continuous Replication)
5 * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
6 * Copyright (c) 2016 FUJITSU LIMITED
7 * Copyright (c) 2016 Intel Corporation
9 * This work is licensed under the terms of the GNU GPL, version 2 or
10 * later. See the COPYING file in the top-level directory.
13 #include "qemu/osdep.h"
14 #include "sysemu/sysemu.h"
15 #include "qapi/error.h"
16 #include "qapi/qapi-commands-migration.h"
17 #include "qemu-file-channel.h"
18 #include "migration.h"
19 #include "qemu-file.h"
21 #include "migration/colo.h"
23 #include "io/channel-buffer.h"
25 #include "qemu/error-report.h"
26 #include "migration/failover.h"
27 #include "replication.h"
28 #include "net/colo-compare.h"
30 #include "block/block.h"
31 #include "qapi/qapi-events-migration.h"
32 #include "qapi/qmp/qerror.h"
33 #include "sysemu/cpus.h"
34 #include "net/filter.h"
/*
 * Set to true by colo_process_incoming_thread() right before it calls
 * qemu_load_device_state() and reset to false afterwards.
 * secondary_vm_do_failover() tests it before doing the failover work.
 */
36 static bool vmstate_loading;
/*
 * Notifier handed to colo_compare_register_notifier() by
 * colo_process_checkpoint(); its callback is colo_compare_notify_checkpoint().
 */
37 static Notifier packets_compare_notifier;
/* Size passed to qio_channel_buffer_new() for the device state buffer. */
39 #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
/*
 * Return true when the state of the current outgoing migration
 * (migrate_get_current()) is MIGRATION_STATUS_COLO.
 */
41 bool migration_in_colo_state(void)
43 MigrationState *s = migrate_get_current();
45 return (s->state == MIGRATION_STATUS_COLO);
/*
 * Return true when an incoming migration state object exists and its state
 * is MIGRATION_STATUS_COLO. A NULL incoming state yields false.
 */
48 bool migration_incoming_in_colo_state(void)
50 MigrationIncomingState *mis = migration_incoming_get_current();
52 return mis && (mis->state == MIGRATION_STATUS_COLO);
/*
 * Return true when the run state is RUN_STATE_COLO or the VM is not
 * running at all.
 */
55 static bool colo_runstate_is_stopped(void)
57 return runstate_check(RUN_STATE_COLO) || !runstate_is_running();
/*
 * Failover work for the Secondary side.
 *
 * While vmstate_loading is set, the failover state is switched from
 * ACTIVE to RELAUNCH instead; colo_process_incoming_thread() re-requests
 * the failover once it sees RELAUNCH after loading has finished.
 *
 * Otherwise the visible steps are:
 *  - move the incoming migration state from COLO to COMPLETED;
 *  - stop replication with replication_stop_all(true, ...) and send
 *    COLO_EVENT_FAILOVER to the net filters, reporting any error;
 *  - shut down from_src_file and to_src_file when they are set;
 *  - switch the failover state from ACTIVE to COMPLETED and report an
 *    unexpected previous state;
 *  - post colo_incoming_sem and enter migration_incoming_co if set.
 *
 * NOTE(review): the conditions guarding the error reports and the exit
 * taken in the vmstate_loading case are not shown here - confirm them
 * against the full file.
 */
60 static void secondary_vm_do_failover(void)
63 MigrationIncomingState *mis = migration_incoming_get_current();
64 Error *local_err = NULL;
66 /* Can not do failover during the process of VM's loading VMstate, Or
67 * it will break the secondary VM.
69 if (vmstate_loading) {
70 old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
71 FAILOVER_STATUS_RELAUNCH);
72 if (old_state != FAILOVER_STATUS_ACTIVE) {
73 error_report("Unknown error while do failover for secondary VM,"
74 "old_state: %s", FailoverStatus_str(old_state));
79 migrate_set_state(&mis->state, MIGRATION_STATUS_COLO,
80 MIGRATION_STATUS_COMPLETED);
82 replication_stop_all(true, &local_err);
84 error_report_err(local_err);
87 /* Notify all filters of all NIC to do checkpoint */
88 colo_notify_filters_event(COLO_EVENT_FAILOVER, &local_err);
90 error_report_err(local_err);
94 error_report("\"-S\" qemu option will be ignored in secondary side");
95 /* recover runstate to normal migration finish state */
99 * Make sure COLO incoming thread not block in recv or send,
100 * If mis->from_src_file and mis->to_src_file use the same fd,
101 * The second shutdown() will return -1, we ignore this value,
104 if (mis->from_src_file) {
105 qemu_file_shutdown(mis->from_src_file);
107 if (mis->to_src_file) {
108 qemu_file_shutdown(mis->to_src_file);
111 old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
112 FAILOVER_STATUS_COMPLETED);
113 if (old_state != FAILOVER_STATUS_ACTIVE) {
114 error_report("Incorrect state (%s) while doing failover for "
115 "secondary VM", FailoverStatus_str(old_state));
118 /* Notify COLO incoming thread that failover work is finished */
119 qemu_sem_post(&mis->colo_incoming_sem);
120 /* For Secondary VM, jump to incoming co */
121 if (mis->migration_incoming_co) {
122 qemu_coroutine_enter(mis->migration_incoming_co);
/*
 * Failover work for the Primary side.
 *
 * Visible steps:
 *  - move the outgoing migration state from COLO to COMPLETED;
 *  - call colo_checkpoint_notify() so a thread waiting on
 *    colo_checkpoint_sem is woken;
 *  - shut down to_dst_file and rp_state.from_dst_file when they are set;
 *  - switch the failover state from ACTIVE to COMPLETED and report an
 *    unexpected previous state;
 *  - stop replication with replication_stop_all(true, ...) and report
 *    any error;
 *  - post colo_exit_sem, which colo_process_checkpoint() waits on.
 */
126 static void primary_vm_do_failover(void)
128 MigrationState *s = migrate_get_current();
130 Error *local_err = NULL;
132 migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
133 MIGRATION_STATUS_COMPLETED);
135 * kick COLO thread which might wait at
136 * qemu_sem_wait(&s->colo_checkpoint_sem).
138 colo_checkpoint_notify(migrate_get_current());
141 * Wake up COLO thread which may blocked in recv() or send(),
142 * The s->rp_state.from_dst_file and s->to_dst_file may use the
143 * same fd, but we still shutdown the fd for twice, it is harmless.
145 if (s->to_dst_file) {
146 qemu_file_shutdown(s->to_dst_file);
148 if (s->rp_state.from_dst_file) {
149 qemu_file_shutdown(s->rp_state.from_dst_file);
152 old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
153 FAILOVER_STATUS_COMPLETED);
154 if (old_state != FAILOVER_STATUS_ACTIVE) {
155 error_report("Incorrect state (%s) while doing failover for Primary VM",
156 FailoverStatus_str(old_state));
160 replication_stop_all(true, &local_err);
162 error_report_err(local_err);
166 /* Notify COLO thread that failover work is finished */
167 qemu_sem_post(&s->colo_exit_sem);
/*
 * Report which COLO role this process currently has.
 *
 * Returns COLO_MODE_PRIMARY when the outgoing migration is in COLO state,
 * COLO_MODE_SECONDARY when the incoming migration is in COLO state, and
 * COLO_MODE_NONE otherwise. The outgoing side is checked first.
 */
170 COLOMode get_colo_mode(void)
172 if (migration_in_colo_state()) {
173 return COLO_MODE_PRIMARY;
174 } else if (migration_incoming_in_colo_state()) {
175 return COLO_MODE_SECONDARY;
177 return COLO_MODE_NONE;
/*
 * Entry point for performing a failover.
 *
 * Forces the VM into RUN_STATE_COLO if it is not already stopped, then
 * runs the Primary or the Secondary failover routine depending on
 * get_colo_mode().
 *
 * @s: not referenced by any of the lines shown here.
 */
181 void colo_do_failover(MigrationState *s)
183 /* Make sure VM stopped while failover happened. */
184 if (!colo_runstate_is_stopped()) {
185 vm_stop_force_state(RUN_STATE_COLO);
188 if (get_colo_mode() == COLO_MODE_PRIMARY) {
189 primary_vm_do_failover();
191 secondary_vm_do_failover();
/*
 * QMP handler that starts or stops block replication.
 *
 * Only has an effect shown here when CONFIG_REPLICATION is defined.
 *
 * @enable: combined with @has_failover to reject a failover argument
 *          when enabling; presumably also selects start versus stop -
 *          TODO confirm, the selecting condition is not shown.
 * @primary: picks REPLICATION_MODE_PRIMARY, else REPLICATION_MODE_SECONDARY.
 * @has_failover: whether @failover was supplied.
 * @failover: passed to replication_stop_all(); when true, errors from
 *            stopping are not reported through errp (NULL is passed).
 * errp receives the error from the argument check, from
 * replication_start_all(), or from a non-failover stop.
 */
195 void qmp_xen_set_replication(bool enable, bool primary,
196 bool has_failover, bool failover,
199 #ifdef CONFIG_REPLICATION
200 ReplicationMode mode = primary ?
201 REPLICATION_MODE_PRIMARY :
202 REPLICATION_MODE_SECONDARY;
204 if (has_failover && enable) {
205 error_setg(errp, "Parameter 'failover' is only for"
206 " stopping replication");
211 replication_start_all(mode, errp);
216 replication_stop_all(failover, failover ? NULL : errp);
/*
 * QMP handler that reports the replication error status.
 *
 * Under CONFIG_REPLICATION it allocates a zeroed ReplicationStatus, asks
 * replication_get_error_all() for a pending error and copies the error
 * text into s->desc with g_strdup().
 *
 * NOTE(review): how the remaining fields are filled and what is returned
 * without CONFIG_REPLICATION is not shown here.
 */
223 ReplicationStatus *qmp_query_xen_replication_status(Error **errp)
225 #ifdef CONFIG_REPLICATION
227 ReplicationStatus *s = g_new0(ReplicationStatus, 1);
229 replication_get_error_all(&err);
233 s->desc = g_strdup(error_get_pretty(err));
/*
 * QMP handler that triggers a replication checkpoint.
 *
 * Under CONFIG_REPLICATION it forwards to replication_do_checkpoint_all(),
 * which reports failure through @errp.
 */
245 void qmp_xen_colo_do_checkpoint(Error **errp)
247 #ifdef CONFIG_REPLICATION
248 replication_do_checkpoint_all(errp);
/*
 * QMP handler that reports the COLO mode and exit reason.
 *
 * Allocates a zeroed COLOStatus, stores get_colo_mode() in s->mode and
 * derives s->reason from failover_get_state():
 *   FAILOVER_STATUS_NONE    -> COLO_EXIT_REASON_NONE
 *   FAILOVER_STATUS_REQUIRE -> COLO_EXIT_REASON_REQUEST
 *   remaining case          -> COLO_EXIT_REASON_ERROR
 *
 * @errp is not referenced by any of the lines shown here.
 */
254 COLOStatus *qmp_query_colo_status(Error **errp)
256 COLOStatus *s = g_new0(COLOStatus, 1);
258 s->mode = get_colo_mode();
260 switch (failover_get_state()) {
261 case FAILOVER_STATUS_NONE:
262 s->reason = COLO_EXIT_REASON_NONE;
264 case FAILOVER_STATUS_REQUIRE:
265 s->reason = COLO_EXIT_REASON_REQUEST;
268 s->reason = COLO_EXIT_REASON_ERROR;
/*
 * Write one COLO control message to @f.
 *
 * @f: file to write to.
 * @msg: message code; values at or above COLO_MESSAGE__MAX are rejected
 *       with an error.
 * The code is written as a big-endian 32-bit value. Afterwards the file
 * error status is read with qemu_file_get_error() and turned into an
 * errno-based error, and the sent message name is traced.
 */
274 static void colo_send_message(QEMUFile *f, COLOMessage msg,
279 if (msg >= COLO_MESSAGE__MAX) {
280 error_setg(errp, "%s: Invalid message", __func__);
283 qemu_put_be32(f, msg);
286 ret = qemu_file_get_error(f);
288 error_setg_errno(errp, -ret, "Can't send COLO message");
290 trace_colo_send_message(COLOMessage_str(msg));
/*
 * Write a COLO control message followed by a 64-bit payload.
 *
 * @f: file to write to.
 * @msg: message code, sent first through colo_send_message().
 * @value: payload, written as a big-endian 64-bit value.
 * @errp: receives the error from sending the message code, or an
 *        errno-based error taken from qemu_file_get_error() after the
 *        value has been written.
 */
293 static void colo_send_message_value(QEMUFile *f, COLOMessage msg,
294 uint64_t value, Error **errp)
296 Error *local_err = NULL;
299 colo_send_message(f, msg, &local_err);
301 error_propagate(errp, local_err);
304 qemu_put_be64(f, value);
307 ret = qemu_file_get_error(f);
309 error_setg_errno(errp, -ret, "Failed to send value for message:%s",
310 COLOMessage_str(msg));
/*
 * Read one COLO control message from @f.
 *
 * The code is read as a big-endian 32-bit value. An errno-based error is
 * set from qemu_file_get_error(), and a code at or above
 * COLO_MESSAGE__MAX is rejected as invalid. The received message name is
 * traced.
 *
 * @errp: receives the read or validation error.
 */
314 static COLOMessage colo_receive_message(QEMUFile *f, Error **errp)
319 msg = qemu_get_be32(f);
320 ret = qemu_file_get_error(f);
322 error_setg_errno(errp, -ret, "Can't receive COLO message");
325 if (msg >= COLO_MESSAGE__MAX) {
326 error_setg(errp, "%s: Invalid message", __func__);
329 trace_colo_receive_message(COLOMessage_str(msg));
/*
 * Read one COLO control message from @f and require it to be @expect_msg.
 *
 * An error from colo_receive_message() is propagated to errp. A message
 * different from @expect_msg sets an error naming both codes.
 */
333 static void colo_receive_check_message(QEMUFile *f, COLOMessage expect_msg,
337 Error *local_err = NULL;
339 msg = colo_receive_message(f, &local_err);
341 error_propagate(errp, local_err);
344 if (msg != expect_msg) {
345 error_setg(errp, "Unexpected COLO message %d, expected %d",
/*
 * Read the expected COLO control message and the 64-bit payload after it.
 *
 * @f: file to read from.
 * @expect_msg: message code that must arrive first; checked through
 *              colo_receive_check_message().
 * The payload is read as a big-endian 64-bit value; an errno-based error
 * is set from qemu_file_get_error() after the read.
 *
 * NOTE(review): the value returned on the error paths is not shown here.
 */
350 static uint64_t colo_receive_message_value(QEMUFile *f, uint32_t expect_msg,
353 Error *local_err = NULL;
357 colo_receive_check_message(f, expect_msg, &local_err);
359 error_propagate(errp, local_err);
363 value = qemu_get_be64(f);
364 ret = qemu_file_get_error(f);
366 error_setg_errno(errp, -ret, "Failed to get value for COLO message: %s",
367 COLOMessage_str(expect_msg));
/*
 * Run one checkpoint on the Primary side.
 *
 * @s: outgoing migration state; to_dst_file and rp_state.from_dst_file
 *     carry the traffic with the Secondary.
 * @bioc: buffer channel whose data and usage hold the device state.
 * A further QEMUFile named fb is used as the target of
 * qemu_save_device_state(); its declaration is not shown here.
 *
 * Visible order of operations:
 *  1. send CHECKPOINT_REQUEST and wait for CHECKPOINT_REPLY;
 *  2. rewind @bioc to offset 0;
 *  3. under the iothread lock, check the failover state and force the VM
 *     into RUN_STATE_COLO; the failover state is checked again afterwards;
 *  4. send COLO_EVENT_CHECKPOINT to the compare side;
 *  5. disable block migration and, under the iothread lock, run
 *     replication_do_checkpoint_all();
 *  6. send VMSTATE_SEND, save device state into the buffer and write the
 *     live state to to_dst_file;
 *  7. send VMSTATE_SIZE carrying bioc->usage, then the buffer contents,
 *     flush and read the file error status;
 *  8. wait for VMSTATE_RECEIVED and then VMSTATE_LOADED;
 *  9. take the iothread lock again and trace the stop to run change.
 * A pending local_err is reported with error_report_err().
 *
 * NOTE(review): the return value and the error exits are not shown here;
 * the caller stores the result in an int.
 */
372 static int colo_do_checkpoint_transaction(MigrationState *s,
373 QIOChannelBuffer *bioc,
376 Error *local_err = NULL;
379 colo_send_message(s->to_dst_file, COLO_MESSAGE_CHECKPOINT_REQUEST,
385 colo_receive_check_message(s->rp_state.from_dst_file,
386 COLO_MESSAGE_CHECKPOINT_REPLY, &local_err);
390 /* Reset channel-buffer directly */
391 qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
394 qemu_mutex_lock_iothread();
395 if (failover_get_state() != FAILOVER_STATUS_NONE) {
396 qemu_mutex_unlock_iothread();
399 vm_stop_force_state(RUN_STATE_COLO);
400 qemu_mutex_unlock_iothread();
401 trace_colo_vm_state_change("run", "stop");
403 * Failover request bh could be called after vm_stop_force_state(),
404 * So we need check failover_request_is_active() again.
406 if (failover_get_state() != FAILOVER_STATUS_NONE) {
410 colo_notify_compares_event(NULL, COLO_EVENT_CHECKPOINT, &local_err);
415 /* Disable block migration */
416 migrate_set_block_enabled(false, &local_err);
417 qemu_mutex_lock_iothread();
418 replication_do_checkpoint_all(&local_err);
420 qemu_mutex_unlock_iothread();
424 colo_send_message(s->to_dst_file, COLO_MESSAGE_VMSTATE_SEND, &local_err);
426 qemu_mutex_unlock_iothread();
429 /* Note: device state is saved into buffer */
430 ret = qemu_save_device_state(fb);
432 qemu_mutex_unlock_iothread();
437 * Only save VM's live state, which not including device state.
438 * TODO: We may need a timeout mechanism to prevent COLO process
439 * to be blocked here.
441 qemu_savevm_live_state(s->to_dst_file);
446 * We need the size of the VMstate data in Secondary side,
447 * With which we can decide how much data should be read.
449 colo_send_message_value(s->to_dst_file, COLO_MESSAGE_VMSTATE_SIZE,
450 bioc->usage, &local_err);
455 qemu_put_buffer(s->to_dst_file, bioc->data, bioc->usage);
456 qemu_fflush(s->to_dst_file);
457 ret = qemu_file_get_error(s->to_dst_file);
462 colo_receive_check_message(s->rp_state.from_dst_file,
463 COLO_MESSAGE_VMSTATE_RECEIVED, &local_err);
468 colo_receive_check_message(s->rp_state.from_dst_file,
469 COLO_MESSAGE_VMSTATE_LOADED, &local_err);
476 qemu_mutex_lock_iothread();
478 qemu_mutex_unlock_iothread();
479 trace_colo_vm_state_change("stop", "run");
483 error_report_err(local_err);
/*
 * Notifier callback installed in packets_compare_notifier.
 *
 * Forwards @data unchanged to colo_checkpoint_notify(), which treats it
 * as a MigrationState pointer. @notifier is not used.
 */
488 static void colo_compare_notify_checkpoint(Notifier *notifier, void *data)
490 colo_checkpoint_notify(data);
/*
 * Main COLO routine of the Primary side.
 *
 * Setup: initialise the failover state, open the return path as
 * rp_state.from_dst_file, register packets_compare_notifier, wait for
 * CHECKPOINT_READY from the Secondary, create the device state buffer
 * with its output QEMUFile, start replication in primary mode under the
 * iothread lock and arm colo_delay_timer.
 *
 * Loop: while the migration state is MIGRATION_STATUS_COLO, check for a
 * failover request, wait on colo_checkpoint_sem, re-check the state and
 * run colo_do_checkpoint_transaction().
 *
 * Teardown: report a pending error, emit the COLO_EXIT event matching the
 * failover state, wait on colo_exit_sem and destroy it, unregister the
 * notifier, delete and free the timer, destroy colo_checkpoint_sem and
 * close rp_state.from_dst_file if it was opened.
 *
 * NOTE(review): the loop exits and the error jumps are not shown here.
 */
493 static void colo_process_checkpoint(MigrationState *s)
495 QIOChannelBuffer *bioc;
497 int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
498 Error *local_err = NULL;
501 failover_init_state();
503 s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
504 if (!s->rp_state.from_dst_file) {
505 error_report("Open QEMUFile from_dst_file failed");
509 packets_compare_notifier.notify = colo_compare_notify_checkpoint;
510 colo_compare_register_notifier(&packets_compare_notifier);
513 * Wait for Secondary finish loading VM states and enter COLO
516 colo_receive_check_message(s->rp_state.from_dst_file,
517 COLO_MESSAGE_CHECKPOINT_READY, &local_err);
521 bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
522 fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
/* NOTE(review): presumably fb holds its own reference to bioc - confirm. */
523 object_unref(OBJECT(bioc));
525 qemu_mutex_lock_iothread();
526 replication_start_all(REPLICATION_MODE_PRIMARY, &local_err);
528 qemu_mutex_unlock_iothread();
533 qemu_mutex_unlock_iothread();
534 trace_colo_vm_state_change("stop", "run");
536 timer_mod(s->colo_delay_timer,
537 current_time + s->parameters.x_checkpoint_delay);
539 while (s->state == MIGRATION_STATUS_COLO) {
540 if (failover_get_state() != FAILOVER_STATUS_NONE) {
541 error_report("failover request");
545 qemu_sem_wait(&s->colo_checkpoint_sem);
547 if (s->state != MIGRATION_STATUS_COLO) {
550 ret = colo_do_checkpoint_transaction(s, bioc, fb);
557 /* Throw the unreported error message after exited from loop */
559 error_report_err(local_err);
567 * There are only two reasons we can get here, some error happened
568 * or the user triggered failover.
570 switch (failover_get_state()) {
571 case FAILOVER_STATUS_NONE:
572 qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
573 COLO_EXIT_REASON_ERROR);
575 case FAILOVER_STATUS_REQUIRE:
576 qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
577 COLO_EXIT_REASON_REQUEST);
583 /* Hope this not to be too long to wait here */
584 qemu_sem_wait(&s->colo_exit_sem);
585 qemu_sem_destroy(&s->colo_exit_sem);
588 * It is safe to unregister notifier after failover finished.
589 * Besides, colo_delay_timer and colo_checkpoint_sem can't be
590 * released befor unregister notifier, or there will be use-after-free
593 colo_compare_unregister_notifier(&packets_compare_notifier);
594 timer_del(s->colo_delay_timer);
595 timer_free(s->colo_delay_timer);
596 qemu_sem_destroy(&s->colo_checkpoint_sem);
599 * Must be called after failover BH is completed,
600 * Or the failover BH may shutdown the wrong fd that
601 * re-used by other threads after we release here.
603 if (s->rp_state.from_dst_file) {
604 qemu_fclose(s->rp_state.from_dst_file);
/*
 * Request a checkpoint and re-arm the periodic checkpoint timer.
 *
 * @opaque: the MigrationState. The function is used as the callback of
 *          colo_delay_timer and is also called directly.
 *
 * Posts colo_checkpoint_sem, records the current QEMU_CLOCK_HOST time in
 * colo_checkpoint_time and sets colo_delay_timer to fire
 * x_checkpoint_delay milliseconds after that time.
 */
608 void colo_checkpoint_notify(void *opaque)
610 MigrationState *s = opaque;
611 int64_t next_notify_time;
613 qemu_sem_post(&s->colo_checkpoint_sem);
614 s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
615 next_notify_time = s->colo_checkpoint_time +
616 s->parameters.x_checkpoint_delay;
617 timer_mod(s->colo_delay_timer, next_notify_time);
/*
 * Enter COLO mode on the Primary side.
 *
 * The iothread lock is released on entry and taken again before
 * returning, so the caller is expected to hold it. In between, the
 * function initialises colo_checkpoint_sem and colo_exit_sem to 0,
 * creates colo_delay_timer with colo_checkpoint_notify() as callback,
 * moves the migration state from ACTIVE to COLO and runs
 * colo_process_checkpoint().
 */
620 void migrate_start_colo_process(MigrationState *s)
622 qemu_mutex_unlock_iothread();
623 qemu_sem_init(&s->colo_checkpoint_sem, 0);
624 s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST,
625 colo_checkpoint_notify, s);
627 qemu_sem_init(&s->colo_exit_sem, 0);
628 migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
629 MIGRATION_STATUS_COLO);
630 colo_process_checkpoint(s);
631 qemu_mutex_lock_iothread();
/*
 * Read the next COLO message on the Secondary side and classify it.
 *
 * @f: file to read from.
 * @checkpoint_request: set to 1 for COLO_MESSAGE_CHECKPOINT_REQUEST;
 *                      set to 0, together with an error, for the other
 *                      handled case.
 * A receive error is propagated to errp.
 */
634 static void colo_wait_handle_message(QEMUFile *f, int *checkpoint_request,
638 Error *local_err = NULL;
640 msg = colo_receive_message(f, &local_err);
642 error_propagate(errp, local_err);
647 case COLO_MESSAGE_CHECKPOINT_REQUEST:
648 *checkpoint_request = 1;
651 *checkpoint_request = 0;
652 error_setg(errp, "Got unknown COLO message: %d", msg);
/*
 * Thread body running COLO on the Secondary side.
 *
 * @opaque: the MigrationIncomingState.
 *
 * Setup: register with RCU, initialise colo_incoming_sem, move the
 * incoming state from ACTIVE to COLO, initialise the failover state, open
 * the return path as to_src_file, switch from_src_file to blocking mode,
 * create the device state buffer with its input QEMUFile, start
 * replication in secondary mode under the iothread lock and send
 * CHECKPOINT_READY.
 *
 * Loop, while the state is MIGRATION_STATUS_COLO:
 *  - wait for a message with colo_wait_handle_message();
 *  - check for a failover request;
 *  - stop the VM into RUN_STATE_COLO and send CHECKPOINT_REPLY;
 *  - expect VMSTATE_SEND and load the live state from from_src_file;
 *  - read the VMSTATE_SIZE value, grow the buffer if needed and read that
 *    many bytes of device state into it;
 *  - send VMSTATE_RECEIVED;
 *  - with vmstate_loading set, load the device state from the buffer,
 *    check for replication errors, run the replication checkpoint and
 *    send COLO_EVENT_CHECKPOINT to the net filters;
 *  - clear vmstate_loading, and re-request a failover that was parked in
 *    the RELAUNCH state;
 *  - send VMSTATE_LOADED.
 *
 * Teardown: clear vmstate_loading, report a pending error, emit the
 * COLO_EXIT event matching the failover state, wait on colo_incoming_sem
 * and destroy it, close to_src_file if opened, call
 * migration_incoming_disable_colo() and unregister from RCU.
 *
 * NOTE(review): the error exits and the returned pointer are not shown
 * here.
 */
657 void *colo_process_incoming_thread(void *opaque)
659 MigrationIncomingState *mis = opaque;
661 QIOChannelBuffer *bioc = NULL; /* Cache incoming device state */
664 Error *local_err = NULL;
667 rcu_register_thread();
668 qemu_sem_init(&mis->colo_incoming_sem, 0);
670 migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
671 MIGRATION_STATUS_COLO);
673 failover_init_state();
675 mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
676 if (!mis->to_src_file) {
677 error_report("COLO incoming thread: Open QEMUFile to_src_file failed");
681 * Note: the communication between Primary side and Secondary side
682 * should be sequential, we set the fd to unblocked in migration incoming
683 * coroutine, and here we are in the COLO incoming thread, so it is ok to
684 * set the fd back to blocked.
686 qemu_file_set_blocking(mis->from_src_file, true);
688 bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
689 fb = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
690 object_unref(OBJECT(bioc));
692 qemu_mutex_lock_iothread();
693 replication_start_all(REPLICATION_MODE_SECONDARY, &local_err);
695 qemu_mutex_unlock_iothread();
699 trace_colo_vm_state_change("stop", "run");
700 qemu_mutex_unlock_iothread();
702 colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_READY,
708 while (mis->state == MIGRATION_STATUS_COLO) {
711 colo_wait_handle_message(mis->from_src_file, &request, &local_err);
716 if (failover_get_state() != FAILOVER_STATUS_NONE) {
717 error_report("failover request");
721 qemu_mutex_lock_iothread();
722 vm_stop_force_state(RUN_STATE_COLO);
723 trace_colo_vm_state_change("run", "stop");
724 qemu_mutex_unlock_iothread();
726 /* FIXME: This is unnecessary for periodic checkpoint mode */
727 colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_REPLY,
733 colo_receive_check_message(mis->from_src_file,
734 COLO_MESSAGE_VMSTATE_SEND, &local_err);
739 qemu_mutex_lock_iothread();
740 cpu_synchronize_all_pre_loadvm();
741 ret = qemu_loadvm_state_main(mis->from_src_file, mis);
742 qemu_mutex_unlock_iothread();
745 error_report("Load VM's live state (ram) error");
749 value = colo_receive_message_value(mis->from_src_file,
750 COLO_MESSAGE_VMSTATE_SIZE, &local_err);
756 * Read VM device state data into channel buffer,
757 * It's better to re-use the memory allocated.
758 * Here we need to handle the channel buffer directly.
760 if (value > bioc->capacity) {
761 bioc->capacity = value;
762 bioc->data = g_realloc(bioc->data, bioc->capacity);
764 total_size = qemu_get_buffer(mis->from_src_file, bioc->data, value);
765 if (total_size != value) {
766 error_report("Got %" PRIu64 " VMState data, less than expected"
767 " %" PRIu64, total_size, value);
770 bioc->usage = total_size;
771 qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
773 colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_RECEIVED,
779 qemu_mutex_lock_iothread();
780 vmstate_loading = true;
781 ret = qemu_load_device_state(fb);
783 error_report("COLO: load device state failed");
784 qemu_mutex_unlock_iothread();
788 replication_get_error_all(&local_err);
790 qemu_mutex_unlock_iothread();
793 /* discard colo disk buffer */
794 replication_do_checkpoint_all(&local_err);
796 qemu_mutex_unlock_iothread();
800 /* Notify all filters of all NIC to do checkpoint */
801 colo_notify_filters_event(COLO_EVENT_CHECKPOINT, &local_err);
804 qemu_mutex_unlock_iothread();
808 vmstate_loading = false;
810 trace_colo_vm_state_change("stop", "run");
811 qemu_mutex_unlock_iothread();
/* A failover parked as RELAUNCH during loading is requested again here. */
813 if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
814 failover_set_state(FAILOVER_STATUS_RELAUNCH,
815 FAILOVER_STATUS_NONE);
816 failover_request_active(NULL);
820 colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED,
828 vmstate_loading = false;
829 /* Throw the unreported error message after exited from loop */
831 error_report_err(local_err);
834 switch (failover_get_state()) {
835 case FAILOVER_STATUS_NONE:
836 qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
837 COLO_EXIT_REASON_ERROR);
839 case FAILOVER_STATUS_REQUIRE:
840 qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
841 COLO_EXIT_REASON_REQUEST);
851 /* Hope this not to be too long to loop here */
852 qemu_sem_wait(&mis->colo_incoming_sem);
853 qemu_sem_destroy(&mis->colo_incoming_sem);
854 /* Must be called after failover BH is completed */
855 if (mis->to_src_file) {
856 qemu_fclose(mis->to_src_file);
858 migration_incoming_disable_colo();
860 rcu_unregister_thread();