migration/colo.c

   1 /*
   2  * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
   3  * (a.k.a. Fault Tolerance or Continuous Replication)
   4  *
   5  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
   6  * Copyright (c) 2016 FUJITSU LIMITED
   7  * Copyright (c) 2016 Intel Corporation
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2 or
  10  * later.  See the COPYING file in the top-level directory.
  11  */
  12
  13 #include "qemu/osdep.h"
  14 #include "sysemu/sysemu.h"
  15 #include "qemu-file-channel.h"
  16 #include "migration.h"
  17 #include "qemu-file.h"
  18 #include "savevm.h"
  19 #include "migration/colo.h"
  20 #include "block.h"
  21 #include "io/channel-buffer.h"
  22 #include "trace.h"
  23 #include "qemu/error-report.h"
  24 #include "migration/failover.h"
  25 #include "replication.h"
  26 #include "qmp-commands.h"
  27
  28 static bool vmstate_loading;
  29
  30 #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
  31
  32 bool migration_in_colo_state(void)
  33 {
  34     MigrationState *s = migrate_get_current();
  35
  36     return (s->state == MIGRATION_STATUS_COLO);
  37 }
  38
  39 bool migration_incoming_in_colo_state(void)
  40 {
  41     MigrationIncomingState *mis = migration_incoming_get_current();
  42
  43     return mis && (mis->state == MIGRATION_STATUS_COLO);
  44 }
  45
  46 static bool colo_runstate_is_stopped(void)
  47 {
  48     return runstate_check(RUN_STATE_COLO) || !runstate_is_running();
  49 }
  50
  51 static void secondary_vm_do_failover(void)
  52 {
  53     int old_state;
  54     MigrationIncomingState *mis = migration_incoming_get_current();
  55
  56     /* Can not do failover during the process of VM's loading VMstate, Or
  57      * it will break the secondary VM.
  58      */
  59     if (vmstate_loading) {
  60         old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
  61                         FAILOVER_STATUS_RELAUNCH);
  62         if (old_state != FAILOVER_STATUS_ACTIVE) {
  63             error_report("Unknown error while do failover for secondary VM,"
  64                          "old_state: %s", FailoverStatus_str(old_state));
  65         }
  66         return;
  67     }
  68
  69     migrate_set_state(&mis->state, MIGRATION_STATUS_COLO,
  70                       MIGRATION_STATUS_COMPLETED);
  71
  72     if (!autostart) {
  73         error_report("\"-S\" qemu option will be ignored in secondary side");
  74         /* recover runstate to normal migration finish state */
  75         autostart = true;
  76     }
  77     /*
  78      * Make sure COLO incoming thread not block in recv or send,
  79      * If mis->from_src_file and mis->to_src_file use the same fd,
  80      * The second shutdown() will return -1, we ignore this value,
  81      * It is harmless.
  82      */
  83     if (mis->from_src_file) {
  84         qemu_file_shutdown(mis->from_src_file);
  85     }
  86     if (mis->to_src_file) {
  87         qemu_file_shutdown(mis->to_src_file);
  88     }
  89
  90     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
  91                                    FAILOVER_STATUS_COMPLETED);
  92     if (old_state != FAILOVER_STATUS_ACTIVE) {
  93         error_report("Incorrect state (%s) while doing failover for "
  94                      "secondary VM", FailoverStatus_str(old_state));
  95         return;
  96     }
  97     /* Notify COLO incoming thread that failover work is finished */
  98     qemu_sem_post(&mis->colo_incoming_sem);
  99     /* For Secondary VM, jump to incoming co */
 100     if (mis->migration_incoming_co) {
 101         qemu_coroutine_enter(mis->migration_incoming_co);
 102     }
 103 }
 104
 105 static void primary_vm_do_failover(void)
 106 {
 107     MigrationState *s = migrate_get_current();
 108     int old_state;
 109
 110     migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
 111                       MIGRATION_STATUS_COMPLETED);
 112
 113     /*
 114      * Wake up COLO thread which may blocked in recv() or send(),
 115      * The s->rp_state.from_dst_file and s->to_dst_file may use the
 116      * same fd, but we still shutdown the fd for twice, it is harmless.
 117      */
 118     if (s->to_dst_file) {
 119         qemu_file_shutdown(s->to_dst_file);
 120     }
 121     if (s->rp_state.from_dst_file) {
 122         qemu_file_shutdown(s->rp_state.from_dst_file);
 123     }
 124
 125     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
 126                                    FAILOVER_STATUS_COMPLETED);
 127     if (old_state != FAILOVER_STATUS_ACTIVE) {
 128         error_report("Incorrect state (%s) while doing failover for Primary VM",
 129                      FailoverStatus_str(old_state));
 130         return;
 131     }
 132     /* Notify COLO thread that failover work is finished */
 133     qemu_sem_post(&s->colo_exit_sem);
 134 }
 135
 136 void colo_do_failover(MigrationState *s)
 137 {
 138     /* Make sure VM stopped while failover happened. */
 139     if (!colo_runstate_is_stopped()) {
 140         vm_stop_force_state(RUN_STATE_COLO);
 141     }
 142
 143     if (get_colo_mode() == COLO_MODE_PRIMARY) {
 144         primary_vm_do_failover();
 145     } else {
 146         secondary_vm_do_failover();
 147     }
 148 }
 149
 150 void qmp_xen_set_replication(bool enable, bool primary,
 151                              bool has_failover, bool failover,
 152                              Error **errp)
 153 {
 154 #ifdef CONFIG_REPLICATION
 155     ReplicationMode mode = primary ?
 156                            REPLICATION_MODE_PRIMARY :
 157                            REPLICATION_MODE_SECONDARY;
 158
 159     if (has_failover && enable) {
 160         error_setg(errp, "Parameter 'failover' is only for"
 161                    " stopping replication");
 162         return;
 163     }
 164
 165     if (enable) {
 166         replication_start_all(mode, errp);
 167     } else {
 168         if (!has_failover) {
 169             failover = NULL;
 170         }
 171         replication_stop_all(failover, failover ? NULL : errp);
 172     }
 173 #else
 174     abort();
 175 #endif
 176 }
 177
 178 ReplicationStatus *qmp_query_xen_replication_status(Error **errp)
 179 {
 180 #ifdef CONFIG_REPLICATION
 181     Error *err = NULL;
 182     ReplicationStatus *s = g_new0(ReplicationStatus, 1);
 183
 184     replication_get_error_all(&err);
 185     if (err) {
 186         s->error = true;
 187         s->has_desc = true;
 188         s->desc = g_strdup(error_get_pretty(err));
 189     } else {
 190         s->error = false;
 191     }
 192
 193     error_free(err);
 194     return s;
 195 #else
 196     abort();
 197 #endif
 198 }
 199
 200 void qmp_xen_colo_do_checkpoint(Error **errp)
 201 {
 202 #ifdef CONFIG_REPLICATION
 203     replication_do_checkpoint_all(errp);
 204 #else
 205     abort();
 206 #endif
 207 }
 208
 209 static void colo_send_message(QEMUFile *f, COLOMessage msg,
 210                               Error **errp)
 211 {
 212     int ret;
 213
 214     if (msg >= COLO_MESSAGE__MAX) {
 215         error_setg(errp, "%s: Invalid message", __func__);
 216         return;
 217     }
 218     qemu_put_be32(f, msg);
 219     qemu_fflush(f);
 220
 221     ret = qemu_file_get_error(f);
 222     if (ret < 0) {
 223         error_setg_errno(errp, -ret, "Can't send COLO message");
 224     }
 225     trace_colo_send_message(COLOMessage_str(msg));
 226 }
 227
 228 static void colo_send_message_value(QEMUFile *f, COLOMessage msg,
 229                                     uint64_t value, Error **errp)
 230 {
 231     Error *local_err = NULL;
 232     int ret;
 233
 234     colo_send_message(f, msg, &local_err);
 235     if (local_err) {
 236         error_propagate(errp, local_err);
 237         return;
 238     }
 239     qemu_put_be64(f, value);
 240     qemu_fflush(f);
 241
 242     ret = qemu_file_get_error(f);
 243     if (ret < 0) {
 244         error_setg_errno(errp, -ret, "Failed to send value for message:%s",
 245                          COLOMessage_str(msg));
 246     }
 247 }
 248
 249 static COLOMessage colo_receive_message(QEMUFile *f, Error **errp)
 250 {
 251     COLOMessage msg;
 252     int ret;
 253
 254     msg = qemu_get_be32(f);
 255     ret = qemu_file_get_error(f);
 256     if (ret < 0) {
 257         error_setg_errno(errp, -ret, "Can't receive COLO message");
 258         return msg;
 259     }
 260     if (msg >= COLO_MESSAGE__MAX) {
 261         error_setg(errp, "%s: Invalid message", __func__);
 262         return msg;
 263     }
 264     trace_colo_receive_message(COLOMessage_str(msg));
 265     return msg;
 266 }
 267
 268 static void colo_receive_check_message(QEMUFile *f, COLOMessage expect_msg,
 269                                        Error **errp)
 270 {
 271     COLOMessage msg;
 272     Error *local_err = NULL;
 273
 274     msg = colo_receive_message(f, &local_err);
 275     if (local_err) {
 276         error_propagate(errp, local_err);
 277         return;
 278     }
 279     if (msg != expect_msg) {
 280         error_setg(errp, "Unexpected COLO message %d, expected %d",
 281                           msg, expect_msg);
 282     }
 283 }
 284
 285 static uint64_t colo_receive_message_value(QEMUFile *f, uint32_t expect_msg,
 286                                            Error **errp)
 287 {
 288     Error *local_err = NULL;
 289     uint64_t value;
 290     int ret;
 291
 292     colo_receive_check_message(f, expect_msg, &local_err);
 293     if (local_err) {
 294         error_propagate(errp, local_err);
 295         return 0;
 296     }
 297
 298     value = qemu_get_be64(f);
 299     ret = qemu_file_get_error(f);
 300     if (ret < 0) {
 301         error_setg_errno(errp, -ret, "Failed to get value for COLO message: %s",
 302                          COLOMessage_str(expect_msg));
 303     }
 304     return value;
 305 }
 306
 307 static int colo_do_checkpoint_transaction(MigrationState *s,
 308                                           QIOChannelBuffer *bioc,
 309                                           QEMUFile *fb)
 310 {
 311     Error *local_err = NULL;
 312     int ret = -1;
 313
 314     colo_send_message(s->to_dst_file, COLO_MESSAGE_CHECKPOINT_REQUEST,
 315                       &local_err);
 316     if (local_err) {
 317         goto out;
 318     }
 319
 320     colo_receive_check_message(s->rp_state.from_dst_file,
 321                     COLO_MESSAGE_CHECKPOINT_REPLY, &local_err);
 322     if (local_err) {
 323         goto out;
 324     }
 325     /* Reset channel-buffer directly */
 326     qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
 327     bioc->usage = 0;
 328
 329     qemu_mutex_lock_iothread();
 330     if (failover_get_state() != FAILOVER_STATUS_NONE) {
 331         qemu_mutex_unlock_iothread();
 332         goto out;
 333     }
 334     vm_stop_force_state(RUN_STATE_COLO);
 335     qemu_mutex_unlock_iothread();
 336     trace_colo_vm_state_change("run", "stop");
 337     /*
 338      * Failover request bh could be called after vm_stop_force_state(),
 339      * So we need check failover_request_is_active() again.
 340      */
 341     if (failover_get_state() != FAILOVER_STATUS_NONE) {
 342         goto out;
 343     }
 344
 345     /* Disable block migration */
 346     migrate_set_block_enabled(false, &local_err);
 347     qemu_savevm_state_header(fb);
 348     qemu_savevm_state_setup(fb);
 349     qemu_mutex_lock_iothread();
 350     qemu_savevm_state_complete_precopy(fb, false, false);
 351     qemu_mutex_unlock_iothread();
 352
 353     qemu_fflush(fb);
 354
 355     colo_send_message(s->to_dst_file, COLO_MESSAGE_VMSTATE_SEND, &local_err);
 356     if (local_err) {
 357         goto out;
 358     }
 359     /*
 360      * We need the size of the VMstate data in Secondary side,
 361      * With which we can decide how much data should be read.
 362      */
 363     colo_send_message_value(s->to_dst_file, COLO_MESSAGE_VMSTATE_SIZE,
 364                             bioc->usage, &local_err);
 365     if (local_err) {
 366         goto out;
 367     }
 368
 369     qemu_put_buffer(s->to_dst_file, bioc->data, bioc->usage);
 370     qemu_fflush(s->to_dst_file);
 371     ret = qemu_file_get_error(s->to_dst_file);
 372     if (ret < 0) {
 373         goto out;
 374     }
 375
 376     colo_receive_check_message(s->rp_state.from_dst_file,
 377                        COLO_MESSAGE_VMSTATE_RECEIVED, &local_err);
 378     if (local_err) {
 379         goto out;
 380     }
 381
 382     colo_receive_check_message(s->rp_state.from_dst_file,
 383                        COLO_MESSAGE_VMSTATE_LOADED, &local_err);
 384     if (local_err) {
 385         goto out;
 386     }
 387
 388     ret = 0;
 389
 390     qemu_mutex_lock_iothread();
 391     vm_start();
 392     qemu_mutex_unlock_iothread();
 393     trace_colo_vm_state_change("stop", "run");
 394
 395 out:
 396     if (local_err) {
 397         error_report_err(local_err);
 398     }
 399     return ret;
 400 }
 401
 402 static void colo_process_checkpoint(MigrationState *s)
 403 {
 404     QIOChannelBuffer *bioc;
 405     QEMUFile *fb = NULL;
 406     int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 407     Error *local_err = NULL;
 408     int ret;
 409
 410     failover_init_state();
 411
 412     s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
 413     if (!s->rp_state.from_dst_file) {
 414         error_report("Open QEMUFile from_dst_file failed");
 415         goto out;
 416     }
 417
 418     /*
 419      * Wait for Secondary finish loading VM states and enter COLO
 420      * restore.
 421      */
 422     colo_receive_check_message(s->rp_state.from_dst_file,
 423                        COLO_MESSAGE_CHECKPOINT_READY, &local_err);
 424     if (local_err) {
 425         goto out;
 426     }
 427     bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
 428     fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
 429     object_unref(OBJECT(bioc));
 430
 431     qemu_mutex_lock_iothread();
 432     vm_start();
 433     qemu_mutex_unlock_iothread();
 434     trace_colo_vm_state_change("stop", "run");
 435
 436     timer_mod(s->colo_delay_timer,
 437             current_time + s->parameters.x_checkpoint_delay);
 438
 439     while (s->state == MIGRATION_STATUS_COLO) {
 440         if (failover_get_state() != FAILOVER_STATUS_NONE) {
 441             error_report("failover request");
 442             goto out;
 443         }
 444
 445         qemu_sem_wait(&s->colo_checkpoint_sem);
 446
 447         ret = colo_do_checkpoint_transaction(s, bioc, fb);
 448         if (ret < 0) {
 449             goto out;
 450         }
 451     }
 452
 453 out:
 454     /* Throw the unreported error message after exited from loop */
 455     if (local_err) {
 456         error_report_err(local_err);
 457     }
 458
 459     if (fb) {
 460         qemu_fclose(fb);
 461     }
 462
 463     timer_del(s->colo_delay_timer);
 464
 465     /* Hope this not to be too long to wait here */
 466     qemu_sem_wait(&s->colo_exit_sem);
 467     qemu_sem_destroy(&s->colo_exit_sem);
 468     /*
 469      * Must be called after failover BH is completed,
 470      * Or the failover BH may shutdown the wrong fd that
 471      * re-used by other threads after we release here.
 472      */
 473     if (s->rp_state.from_dst_file) {
 474         qemu_fclose(s->rp_state.from_dst_file);
 475     }
 476 }
 477
 478 void colo_checkpoint_notify(void *opaque)
 479 {
 480     MigrationState *s = opaque;
 481     int64_t next_notify_time;
 482
 483     qemu_sem_post(&s->colo_checkpoint_sem);
 484     s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 485     next_notify_time = s->colo_checkpoint_time +
 486                     s->parameters.x_checkpoint_delay;
 487     timer_mod(s->colo_delay_timer, next_notify_time);
 488 }
 489
 490 void migrate_start_colo_process(MigrationState *s)
 491 {
 492     qemu_mutex_unlock_iothread();
 493     qemu_sem_init(&s->colo_checkpoint_sem, 0);
 494     s->colo_delay_timer =  timer_new_ms(QEMU_CLOCK_HOST,
 495                                 colo_checkpoint_notify, s);
 496
 497     qemu_sem_init(&s->colo_exit_sem, 0);
 498     migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
 499                       MIGRATION_STATUS_COLO);
 500     colo_process_checkpoint(s);
 501     qemu_mutex_lock_iothread();
 502 }
 503
 504 static void colo_wait_handle_message(QEMUFile *f, int *checkpoint_request,
 505                                      Error **errp)
 506 {
 507     COLOMessage msg;
 508     Error *local_err = NULL;
 509
 510     msg = colo_receive_message(f, &local_err);
 511     if (local_err) {
 512         error_propagate(errp, local_err);
 513         return;
 514     }
 515
 516     switch (msg) {
 517     case COLO_MESSAGE_CHECKPOINT_REQUEST:
 518         *checkpoint_request = 1;
 519         break;
 520     default:
 521         *checkpoint_request = 0;
 522         error_setg(errp, "Got unknown COLO message: %d", msg);
 523         break;
 524     }
 525 }
 526
 527 void *colo_process_incoming_thread(void *opaque)
 528 {
 529     MigrationIncomingState *mis = opaque;
 530     QEMUFile *fb = NULL;
 531     QIOChannelBuffer *bioc = NULL; /* Cache incoming device state */
 532     uint64_t total_size;
 533     uint64_t value;
 534     Error *local_err = NULL;
 535
 536     qemu_sem_init(&mis->colo_incoming_sem, 0);
 537
 538     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
 539                       MIGRATION_STATUS_COLO);
 540
 541     failover_init_state();
 542
 543     mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
 544     if (!mis->to_src_file) {
 545         error_report("COLO incoming thread: Open QEMUFile to_src_file failed");
 546         goto out;
 547     }
 548     /*
 549      * Note: the communication between Primary side and Secondary side
 550      * should be sequential, we set the fd to unblocked in migration incoming
 551      * coroutine, and here we are in the COLO incoming thread, so it is ok to
 552      * set the fd back to blocked.
 553      */
 554     qemu_file_set_blocking(mis->from_src_file, true);
 555
 556     bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
 557     fb = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
 558     object_unref(OBJECT(bioc));
 559
 560     colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_READY,
 561                       &local_err);
 562     if (local_err) {
 563         goto out;
 564     }
 565
 566     while (mis->state == MIGRATION_STATUS_COLO) {
 567         int request = 0;
 568
 569         colo_wait_handle_message(mis->from_src_file, &request, &local_err);
 570         if (local_err) {
 571             goto out;
 572         }
 573         assert(request);
 574         if (failover_get_state() != FAILOVER_STATUS_NONE) {
 575             error_report("failover request");
 576             goto out;
 577         }
 578
 579         /* FIXME: This is unnecessary for periodic checkpoint mode */
 580         colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_REPLY,
 581                      &local_err);
 582         if (local_err) {
 583             goto out;
 584         }
 585
 586         colo_receive_check_message(mis->from_src_file,
 587                            COLO_MESSAGE_VMSTATE_SEND, &local_err);
 588         if (local_err) {
 589             goto out;
 590         }
 591
 592         value = colo_receive_message_value(mis->from_src_file,
 593                                  COLO_MESSAGE_VMSTATE_SIZE, &local_err);
 594         if (local_err) {
 595             goto out;
 596         }
 597
 598         /*
 599          * Read VM device state data into channel buffer,
 600          * It's better to re-use the memory allocated.
 601          * Here we need to handle the channel buffer directly.
 602          */
 603         if (value > bioc->capacity) {
 604             bioc->capacity = value;
 605             bioc->data = g_realloc(bioc->data, bioc->capacity);
 606         }
 607         total_size = qemu_get_buffer(mis->from_src_file, bioc->data, value);
 608         if (total_size != value) {
 609             error_report("Got %" PRIu64 " VMState data, less than expected"
 610                         " %" PRIu64, total_size, value);
 611             goto out;
 612         }
 613         bioc->usage = total_size;
 614         qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
 615
 616         colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_RECEIVED,
 617                      &local_err);
 618         if (local_err) {
 619             goto out;
 620         }
 621
 622         qemu_mutex_lock_iothread();
 623         qemu_system_reset(SHUTDOWN_CAUSE_NONE);
 624         vmstate_loading = true;
 625         if (qemu_loadvm_state(fb) < 0) {
 626             error_report("COLO: loadvm failed");
 627             qemu_mutex_unlock_iothread();
 628             goto out;
 629         }
 630
 631         vmstate_loading = false;
 632         qemu_mutex_unlock_iothread();
 633
 634         if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
 635             failover_set_state(FAILOVER_STATUS_RELAUNCH,
 636                             FAILOVER_STATUS_NONE);
 637             failover_request_active(NULL);
 638             goto out;
 639         }
 640
 641         colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED,
 642                      &local_err);
 643         if (local_err) {
 644             goto out;
 645         }
 646     }
 647
 648 out:
 649     vmstate_loading = false;
 650     /* Throw the unreported error message after exited from loop */
 651     if (local_err) {
 652         error_report_err(local_err);
 653     }
 654
 655     if (fb) {
 656         qemu_fclose(fb);
 657     }
 658
 659     /* Hope this not to be too long to loop here */
 660     qemu_sem_wait(&mis->colo_incoming_sem);
 661     qemu_sem_destroy(&mis->colo_incoming_sem);
 662     /* Must be called after failover BH is completed */
 663     if (mis->to_src_file) {
 664         qemu_fclose(mis->to_src_file);
 665     }
 666     migration_incoming_exit_colo();
 667
 668     return NULL;
 669 }