migration/colo.c

   1 /*
   2  * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
   3  * (a.k.a. Fault Tolerance or Continuous Replication)
   4  *
   5  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
   6  * Copyright (c) 2016 FUJITSU LIMITED
   7  * Copyright (c) 2016 Intel Corporation
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2 or
  10  * later.  See the COPYING file in the top-level directory.
  11  */
  12
  13 #include "qemu/osdep.h"
  14 #include "qemu/timer.h"
  15 #include "sysemu/sysemu.h"
  16 #include "migration/colo.h"
  17 #include "io/channel-buffer.h"
  18 #include "trace.h"
  19 #include "qemu/error-report.h"
  20 #include "qapi/error.h"
  21 #include "migration/failover.h"
  22 #include "replication.h"
  23 #include "qmp-commands.h"
  24
  25 static bool vmstate_loading;
  26
  27 #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
  28
  29 bool colo_supported(void)
  30 {
  31     return true;
  32 }
  33
  34 bool migration_in_colo_state(void)
  35 {
  36     MigrationState *s = migrate_get_current();
  37
  38     return (s->state == MIGRATION_STATUS_COLO);
  39 }
  40
  41 bool migration_incoming_in_colo_state(void)
  42 {
  43     MigrationIncomingState *mis = migration_incoming_get_current();
  44
  45     return mis && (mis->state == MIGRATION_STATUS_COLO);
  46 }
  47
  48 static bool colo_runstate_is_stopped(void)
  49 {
  50     return runstate_check(RUN_STATE_COLO) || !runstate_is_running();
  51 }
  52
  53 static void secondary_vm_do_failover(void)
  54 {
  55     int old_state;
  56     MigrationIncomingState *mis = migration_incoming_get_current();
  57
  58     /* Can not do failover during the process of VM's loading VMstate, Or
  59      * it will break the secondary VM.
  60      */
  61     if (vmstate_loading) {
  62         old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
  63                         FAILOVER_STATUS_RELAUNCH);
  64         if (old_state != FAILOVER_STATUS_ACTIVE) {
  65             error_report("Unknown error while do failover for secondary VM,"
  66                          "old_state: %s", FailoverStatus_lookup[old_state]);
  67         }
  68         return;
  69     }
  70
  71     migrate_set_state(&mis->state, MIGRATION_STATUS_COLO,
  72                       MIGRATION_STATUS_COMPLETED);
  73
  74     if (!autostart) {
  75         error_report("\"-S\" qemu option will be ignored in secondary side");
  76         /* recover runstate to normal migration finish state */
  77         autostart = true;
  78     }
  79     /*
  80      * Make sure COLO incoming thread not block in recv or send,
  81      * If mis->from_src_file and mis->to_src_file use the same fd,
  82      * The second shutdown() will return -1, we ignore this value,
  83      * It is harmless.
  84      */
  85     if (mis->from_src_file) {
  86         qemu_file_shutdown(mis->from_src_file);
  87     }
  88     if (mis->to_src_file) {
  89         qemu_file_shutdown(mis->to_src_file);
  90     }
  91
  92     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
  93                                    FAILOVER_STATUS_COMPLETED);
  94     if (old_state != FAILOVER_STATUS_ACTIVE) {
  95         error_report("Incorrect state (%s) while doing failover for "
  96                      "secondary VM", FailoverStatus_lookup[old_state]);
  97         return;
  98     }
  99     /* Notify COLO incoming thread that failover work is finished */
 100     qemu_sem_post(&mis->colo_incoming_sem);
 101     /* For Secondary VM, jump to incoming co */
 102     if (mis->migration_incoming_co) {
 103         qemu_coroutine_enter(mis->migration_incoming_co);
 104     }
 105 }
 106
 107 static void primary_vm_do_failover(void)
 108 {
 109     MigrationState *s = migrate_get_current();
 110     int old_state;
 111
 112     migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
 113                       MIGRATION_STATUS_COMPLETED);
 114
 115     /*
 116      * Wake up COLO thread which may blocked in recv() or send(),
 117      * The s->rp_state.from_dst_file and s->to_dst_file may use the
 118      * same fd, but we still shutdown the fd for twice, it is harmless.
 119      */
 120     if (s->to_dst_file) {
 121         qemu_file_shutdown(s->to_dst_file);
 122     }
 123     if (s->rp_state.from_dst_file) {
 124         qemu_file_shutdown(s->rp_state.from_dst_file);
 125     }
 126
 127     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
 128                                    FAILOVER_STATUS_COMPLETED);
 129     if (old_state != FAILOVER_STATUS_ACTIVE) {
 130         error_report("Incorrect state (%s) while doing failover for Primary VM",
 131                      FailoverStatus_lookup[old_state]);
 132         return;
 133     }
 134     /* Notify COLO thread that failover work is finished */
 135     qemu_sem_post(&s->colo_exit_sem);
 136 }
 137
 138 void colo_do_failover(MigrationState *s)
 139 {
 140     /* Make sure VM stopped while failover happened. */
 141     if (!colo_runstate_is_stopped()) {
 142         vm_stop_force_state(RUN_STATE_COLO);
 143     }
 144
 145     if (get_colo_mode() == COLO_MODE_PRIMARY) {
 146         primary_vm_do_failover();
 147     } else {
 148         secondary_vm_do_failover();
 149     }
 150 }
 151
 152 void qmp_xen_set_replication(bool enable, bool primary,
 153                              bool has_failover, bool failover,
 154                              Error **errp)
 155 {
 156     ReplicationMode mode = primary ?
 157                            REPLICATION_MODE_PRIMARY :
 158                            REPLICATION_MODE_SECONDARY;
 159
 160     if (has_failover && enable) {
 161         error_setg(errp, "Parameter 'failover' is only for"
 162                    " stopping replication");
 163         return;
 164     }
 165
 166     if (enable) {
 167         replication_start_all(mode, errp);
 168     } else {
 169         if (!has_failover) {
 170             failover = NULL;
 171         }
 172         replication_stop_all(failover, failover ? NULL : errp);
 173     }
 174 }
 175
 176 ReplicationStatus *qmp_query_xen_replication_status(Error **errp)
 177 {
 178     Error *err = NULL;
 179     ReplicationStatus *s = g_new0(ReplicationStatus, 1);
 180
 181     replication_get_error_all(&err);
 182     if (err) {
 183         s->error = true;
 184         s->has_desc = true;
 185         s->desc = g_strdup(error_get_pretty(err));
 186     } else {
 187         s->error = false;
 188     }
 189
 190     error_free(err);
 191     return s;
 192 }
 193
 194 void qmp_xen_colo_do_checkpoint(Error **errp)
 195 {
 196     replication_do_checkpoint_all(errp);
 197 }
 198
 199 static void colo_send_message(QEMUFile *f, COLOMessage msg,
 200                               Error **errp)
 201 {
 202     int ret;
 203
 204     if (msg >= COLO_MESSAGE__MAX) {
 205         error_setg(errp, "%s: Invalid message", __func__);
 206         return;
 207     }
 208     qemu_put_be32(f, msg);
 209     qemu_fflush(f);
 210
 211     ret = qemu_file_get_error(f);
 212     if (ret < 0) {
 213         error_setg_errno(errp, -ret, "Can't send COLO message");
 214     }
 215     trace_colo_send_message(COLOMessage_lookup[msg]);
 216 }
 217
 218 static void colo_send_message_value(QEMUFile *f, COLOMessage msg,
 219                                     uint64_t value, Error **errp)
 220 {
 221     Error *local_err = NULL;
 222     int ret;
 223
 224     colo_send_message(f, msg, &local_err);
 225     if (local_err) {
 226         error_propagate(errp, local_err);
 227         return;
 228     }
 229     qemu_put_be64(f, value);
 230     qemu_fflush(f);
 231
 232     ret = qemu_file_get_error(f);
 233     if (ret < 0) {
 234         error_setg_errno(errp, -ret, "Failed to send value for message:%s",
 235                          COLOMessage_lookup[msg]);
 236     }
 237 }
 238
 239 static COLOMessage colo_receive_message(QEMUFile *f, Error **errp)
 240 {
 241     COLOMessage msg;
 242     int ret;
 243
 244     msg = qemu_get_be32(f);
 245     ret = qemu_file_get_error(f);
 246     if (ret < 0) {
 247         error_setg_errno(errp, -ret, "Can't receive COLO message");
 248         return msg;
 249     }
 250     if (msg >= COLO_MESSAGE__MAX) {
 251         error_setg(errp, "%s: Invalid message", __func__);
 252         return msg;
 253     }
 254     trace_colo_receive_message(COLOMessage_lookup[msg]);
 255     return msg;
 256 }
 257
 258 static void colo_receive_check_message(QEMUFile *f, COLOMessage expect_msg,
 259                                        Error **errp)
 260 {
 261     COLOMessage msg;
 262     Error *local_err = NULL;
 263
 264     msg = colo_receive_message(f, &local_err);
 265     if (local_err) {
 266         error_propagate(errp, local_err);
 267         return;
 268     }
 269     if (msg != expect_msg) {
 270         error_setg(errp, "Unexpected COLO message %d, expected %d",
 271                           msg, expect_msg);
 272     }
 273 }
 274
 275 static uint64_t colo_receive_message_value(QEMUFile *f, uint32_t expect_msg,
 276                                            Error **errp)
 277 {
 278     Error *local_err = NULL;
 279     uint64_t value;
 280     int ret;
 281
 282     colo_receive_check_message(f, expect_msg, &local_err);
 283     if (local_err) {
 284         error_propagate(errp, local_err);
 285         return 0;
 286     }
 287
 288     value = qemu_get_be64(f);
 289     ret = qemu_file_get_error(f);
 290     if (ret < 0) {
 291         error_setg_errno(errp, -ret, "Failed to get value for COLO message: %s",
 292                          COLOMessage_lookup[expect_msg]);
 293     }
 294     return value;
 295 }
 296
 297 static int colo_do_checkpoint_transaction(MigrationState *s,
 298                                           QIOChannelBuffer *bioc,
 299                                           QEMUFile *fb)
 300 {
 301     Error *local_err = NULL;
 302     int ret = -1;
 303
 304     colo_send_message(s->to_dst_file, COLO_MESSAGE_CHECKPOINT_REQUEST,
 305                       &local_err);
 306     if (local_err) {
 307         goto out;
 308     }
 309
 310     colo_receive_check_message(s->rp_state.from_dst_file,
 311                     COLO_MESSAGE_CHECKPOINT_REPLY, &local_err);
 312     if (local_err) {
 313         goto out;
 314     }
 315     /* Reset channel-buffer directly */
 316     qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
 317     bioc->usage = 0;
 318
 319     qemu_mutex_lock_iothread();
 320     if (failover_get_state() != FAILOVER_STATUS_NONE) {
 321         qemu_mutex_unlock_iothread();
 322         goto out;
 323     }
 324     vm_stop_force_state(RUN_STATE_COLO);
 325     qemu_mutex_unlock_iothread();
 326     trace_colo_vm_state_change("run", "stop");
 327     /*
 328      * Failover request bh could be called after vm_stop_force_state(),
 329      * So we need check failover_request_is_active() again.
 330      */
 331     if (failover_get_state() != FAILOVER_STATUS_NONE) {
 332         goto out;
 333     }
 334
 335     /* Disable block migration */
 336     s->params.blk = 0;
 337     s->params.shared = 0;
 338     qemu_savevm_state_header(fb);
 339     qemu_savevm_state_begin(fb, &s->params);
 340     qemu_mutex_lock_iothread();
 341     qemu_savevm_state_complete_precopy(fb, false);
 342     qemu_mutex_unlock_iothread();
 343
 344     qemu_fflush(fb);
 345
 346     colo_send_message(s->to_dst_file, COLO_MESSAGE_VMSTATE_SEND, &local_err);
 347     if (local_err) {
 348         goto out;
 349     }
 350     /*
 351      * We need the size of the VMstate data in Secondary side,
 352      * With which we can decide how much data should be read.
 353      */
 354     colo_send_message_value(s->to_dst_file, COLO_MESSAGE_VMSTATE_SIZE,
 355                             bioc->usage, &local_err);
 356     if (local_err) {
 357         goto out;
 358     }
 359
 360     qemu_put_buffer(s->to_dst_file, bioc->data, bioc->usage);
 361     qemu_fflush(s->to_dst_file);
 362     ret = qemu_file_get_error(s->to_dst_file);
 363     if (ret < 0) {
 364         goto out;
 365     }
 366
 367     colo_receive_check_message(s->rp_state.from_dst_file,
 368                        COLO_MESSAGE_VMSTATE_RECEIVED, &local_err);
 369     if (local_err) {
 370         goto out;
 371     }
 372
 373     colo_receive_check_message(s->rp_state.from_dst_file,
 374                        COLO_MESSAGE_VMSTATE_LOADED, &local_err);
 375     if (local_err) {
 376         goto out;
 377     }
 378
 379     ret = 0;
 380
 381     qemu_mutex_lock_iothread();
 382     vm_start();
 383     qemu_mutex_unlock_iothread();
 384     trace_colo_vm_state_change("stop", "run");
 385
 386 out:
 387     if (local_err) {
 388         error_report_err(local_err);
 389     }
 390     return ret;
 391 }
 392
 393 static void colo_process_checkpoint(MigrationState *s)
 394 {
 395     QIOChannelBuffer *bioc;
 396     QEMUFile *fb = NULL;
 397     int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 398     Error *local_err = NULL;
 399     int ret;
 400
 401     failover_init_state();
 402
 403     s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
 404     if (!s->rp_state.from_dst_file) {
 405         error_report("Open QEMUFile from_dst_file failed");
 406         goto out;
 407     }
 408
 409     /*
 410      * Wait for Secondary finish loading VM states and enter COLO
 411      * restore.
 412      */
 413     colo_receive_check_message(s->rp_state.from_dst_file,
 414                        COLO_MESSAGE_CHECKPOINT_READY, &local_err);
 415     if (local_err) {
 416         goto out;
 417     }
 418     bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
 419     fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
 420     object_unref(OBJECT(bioc));
 421
 422     qemu_mutex_lock_iothread();
 423     vm_start();
 424     qemu_mutex_unlock_iothread();
 425     trace_colo_vm_state_change("stop", "run");
 426
 427     timer_mod(s->colo_delay_timer,
 428             current_time + s->parameters.x_checkpoint_delay);
 429
 430     while (s->state == MIGRATION_STATUS_COLO) {
 431         if (failover_get_state() != FAILOVER_STATUS_NONE) {
 432             error_report("failover request");
 433             goto out;
 434         }
 435
 436         qemu_sem_wait(&s->colo_checkpoint_sem);
 437
 438         ret = colo_do_checkpoint_transaction(s, bioc, fb);
 439         if (ret < 0) {
 440             goto out;
 441         }
 442     }
 443
 444 out:
 445     /* Throw the unreported error message after exited from loop */
 446     if (local_err) {
 447         error_report_err(local_err);
 448     }
 449
 450     if (fb) {
 451         qemu_fclose(fb);
 452     }
 453
 454     timer_del(s->colo_delay_timer);
 455
 456     /* Hope this not to be too long to wait here */
 457     qemu_sem_wait(&s->colo_exit_sem);
 458     qemu_sem_destroy(&s->colo_exit_sem);
 459     /*
 460      * Must be called after failover BH is completed,
 461      * Or the failover BH may shutdown the wrong fd that
 462      * re-used by other threads after we release here.
 463      */
 464     if (s->rp_state.from_dst_file) {
 465         qemu_fclose(s->rp_state.from_dst_file);
 466     }
 467 }
 468
 469 void colo_checkpoint_notify(void *opaque)
 470 {
 471     MigrationState *s = opaque;
 472     int64_t next_notify_time;
 473
 474     qemu_sem_post(&s->colo_checkpoint_sem);
 475     s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 476     next_notify_time = s->colo_checkpoint_time +
 477                     s->parameters.x_checkpoint_delay;
 478     timer_mod(s->colo_delay_timer, next_notify_time);
 479 }
 480
 481 void migrate_start_colo_process(MigrationState *s)
 482 {
 483     qemu_mutex_unlock_iothread();
 484     qemu_sem_init(&s->colo_checkpoint_sem, 0);
 485     s->colo_delay_timer =  timer_new_ms(QEMU_CLOCK_HOST,
 486                                 colo_checkpoint_notify, s);
 487
 488     qemu_sem_init(&s->colo_exit_sem, 0);
 489     migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
 490                       MIGRATION_STATUS_COLO);
 491     colo_process_checkpoint(s);
 492     qemu_mutex_lock_iothread();
 493 }
 494
 495 static void colo_wait_handle_message(QEMUFile *f, int *checkpoint_request,
 496                                      Error **errp)
 497 {
 498     COLOMessage msg;
 499     Error *local_err = NULL;
 500
 501     msg = colo_receive_message(f, &local_err);
 502     if (local_err) {
 503         error_propagate(errp, local_err);
 504         return;
 505     }
 506
 507     switch (msg) {
 508     case COLO_MESSAGE_CHECKPOINT_REQUEST:
 509         *checkpoint_request = 1;
 510         break;
 511     default:
 512         *checkpoint_request = 0;
 513         error_setg(errp, "Got unknown COLO message: %d", msg);
 514         break;
 515     }
 516 }
 517
 518 void *colo_process_incoming_thread(void *opaque)
 519 {
 520     MigrationIncomingState *mis = opaque;
 521     QEMUFile *fb = NULL;
 522     QIOChannelBuffer *bioc = NULL; /* Cache incoming device state */
 523     uint64_t total_size;
 524     uint64_t value;
 525     Error *local_err = NULL;
 526
 527     qemu_sem_init(&mis->colo_incoming_sem, 0);
 528
 529     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
 530                       MIGRATION_STATUS_COLO);
 531
 532     failover_init_state();
 533
 534     mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
 535     if (!mis->to_src_file) {
 536         error_report("COLO incoming thread: Open QEMUFile to_src_file failed");
 537         goto out;
 538     }
 539     /*
 540      * Note: the communication between Primary side and Secondary side
 541      * should be sequential, we set the fd to unblocked in migration incoming
 542      * coroutine, and here we are in the COLO incoming thread, so it is ok to
 543      * set the fd back to blocked.
 544      */
 545     qemu_file_set_blocking(mis->from_src_file, true);
 546
 547     bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
 548     fb = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
 549     object_unref(OBJECT(bioc));
 550
 551     colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_READY,
 552                       &local_err);
 553     if (local_err) {
 554         goto out;
 555     }
 556
 557     while (mis->state == MIGRATION_STATUS_COLO) {
 558         int request = 0;
 559
 560         colo_wait_handle_message(mis->from_src_file, &request, &local_err);
 561         if (local_err) {
 562             goto out;
 563         }
 564         assert(request);
 565         if (failover_get_state() != FAILOVER_STATUS_NONE) {
 566             error_report("failover request");
 567             goto out;
 568         }
 569
 570         /* FIXME: This is unnecessary for periodic checkpoint mode */
 571         colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_REPLY,
 572                      &local_err);
 573         if (local_err) {
 574             goto out;
 575         }
 576
 577         colo_receive_check_message(mis->from_src_file,
 578                            COLO_MESSAGE_VMSTATE_SEND, &local_err);
 579         if (local_err) {
 580             goto out;
 581         }
 582
 583         value = colo_receive_message_value(mis->from_src_file,
 584                                  COLO_MESSAGE_VMSTATE_SIZE, &local_err);
 585         if (local_err) {
 586             goto out;
 587         }
 588
 589         /*
 590          * Read VM device state data into channel buffer,
 591          * It's better to re-use the memory allocated.
 592          * Here we need to handle the channel buffer directly.
 593          */
 594         if (value > bioc->capacity) {
 595             bioc->capacity = value;
 596             bioc->data = g_realloc(bioc->data, bioc->capacity);
 597         }
 598         total_size = qemu_get_buffer(mis->from_src_file, bioc->data, value);
 599         if (total_size != value) {
 600             error_report("Got %" PRIu64 " VMState data, less than expected"
 601                         " %" PRIu64, total_size, value);
 602             goto out;
 603         }
 604         bioc->usage = total_size;
 605         qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
 606
 607         colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_RECEIVED,
 608                      &local_err);
 609         if (local_err) {
 610             goto out;
 611         }
 612
 613         qemu_mutex_lock_iothread();
 614         qemu_system_reset(VMRESET_SILENT);
 615         vmstate_loading = true;
 616         if (qemu_loadvm_state(fb) < 0) {
 617             error_report("COLO: loadvm failed");
 618             qemu_mutex_unlock_iothread();
 619             goto out;
 620         }
 621
 622         vmstate_loading = false;
 623         qemu_mutex_unlock_iothread();
 624
 625         if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
 626             failover_set_state(FAILOVER_STATUS_RELAUNCH,
 627                             FAILOVER_STATUS_NONE);
 628             failover_request_active(NULL);
 629             goto out;
 630         }
 631
 632         colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED,
 633                      &local_err);
 634         if (local_err) {
 635             goto out;
 636         }
 637     }
 638
 639 out:
 640     vmstate_loading = false;
 641     /* Throw the unreported error message after exited from loop */
 642     if (local_err) {
 643         error_report_err(local_err);
 644     }
 645
 646     if (fb) {
 647         qemu_fclose(fb);
 648     }
 649
 650     /* Hope this not to be too long to loop here */
 651     qemu_sem_wait(&mis->colo_incoming_sem);
 652     qemu_sem_destroy(&mis->colo_incoming_sem);
 653     /* Must be called after failover BH is completed */
 654     if (mis->to_src_file) {
 655         qemu_fclose(mis->to_src_file);
 656     }
 657     migration_incoming_exit_colo();
 658
 659     return NULL;
 660 }