migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <[email protected]>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
/* XBZRLE counters (non-static, so visible outside this file) */
XBZRLECacheStats xbzrle_counters;

/*
 * File-local XBZRLE state: the page cache together with the scratch
 * buffers used while encoding/decoding, and the lock guarding the cache.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    /* taken/released through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
 162 static bool ramblock_is_ignored(RAMBlock *block)
 163 {
 164     return !qemu_ram_is_migratable(block) ||
 165            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 166 }
 167
/*
 * Iterate over every RAMBlock for which ramblock_is_ignored() is false.
 * The "if (...) {} else" tail lets the caller follow the macro with an
 * ordinary statement or braced body.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Iterate over every RAMBlock for which qemu_ram_is_migratable() is true. */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * NOTE(review): presumably removes the unfiltered iterator so the rest of
 * this file must pick one of the filtered variants above - confirm.
 */
#undef RAMBLOCK_FOREACH
 178
 179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 180 {
 181     RAMBlock *block;
 182     int ret = 0;
 183
 184     rcu_read_lock();
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     rcu_read_unlock();
 192     return ret;
 193 }
 194
 195 static void ramblock_recv_map_init(void)
 196 {
 197     RAMBlock *rb;
 198
 199     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 200         assert(!rb->receivedmap);
 201         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 202     }
 203 }
 204
 205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 206 {
 207     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 208                     rb->receivedmap);
 209 }
 210
 211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 212 {
 213     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 217 {
 218     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 219 }
 220
 221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 222                                     size_t nr)
 223 {
 224     bitmap_set_atomic(rb->receivedmap,
 225                       ramblock_recv_bitmap_offset(host_addr, rb),
 226                       nr);
 227 }
 228
 229 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 230
 231 /*
 232  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 233  *
 234  * Returns >0 if success with sent bytes, or <0 if error.
 235  */
 236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 237                                   const char *block_name)
 238 {
 239     RAMBlock *block = qemu_ram_block_by_name(block_name);
 240     unsigned long *le_bitmap, nbits;
 241     uint64_t size;
 242
 243     if (!block) {
 244         error_report("%s: invalid block name: %s", __func__, block_name);
 245         return -1;
 246     }
 247
 248     nbits = block->used_length >> TARGET_PAGE_BITS;
 249
 250     /*
 251      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 252      * machines we may need 4 more bytes for padding (see below
 253      * comment). So extend it a bit before hand.
 254      */
 255     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 256
 257     /*
 258      * Always use little endian when sending the bitmap. This is
 259      * required that when source and destination VMs are not using the
 260      * same endianess. (Note: big endian won't work.)
 261      */
 262     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 263
 264     /* Size of the bitmap, in bytes */
 265     size = DIV_ROUND_UP(nbits, 8);
 266
 267     /*
 268      * size is always aligned to 8 bytes for 64bit machines, but it
 269      * may not be true for 32bit machines. We need this padding to
 270      * make sure the migration can survive even between 32bit and
 271      * 64bit machines.
 272      */
 273     size = ROUND_UP(size, 8);
 274
 275     qemu_put_be64(file, size);
 276     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 277     /*
 278      * Mark as an end, in case the middle part is screwed up due to
 279      * some "misterious" reason.
 280      */
 281     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 282     qemu_fflush(file);
 283
 284     g_free(le_bitmap);
 285
 286     if (qemu_file_get_error(file)) {
 287         return qemu_file_get_error(file);
 288     }
 289
 290     return size + sizeof(size);
 291 }
 292
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* block the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably start offset within rb and length, in bytes */
    hwaddr    offset;
    hwaddr    len;

    /* linkage for the RAMState.src_page_requests queue */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 304
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The file-wide RAM migration state; may be NULL (callers below check). */
static RAMState *ram_state;
 358
/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;

/* Initialise the (empty) precopy notifier list. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
 365
/* Register @n on the precopy notifier list. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
 370
/* Unregister @n from whichever notifier list it is currently on. */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
 375
 376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 377 {
 378     PrecopyNotifyData pnd;
 379     pnd.reason = reason;
 380     pnd.errp = errp;
 381
 382     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 383 }
 384
 385 void precopy_enable_free_page_optimization(void)
 386 {
 387     if (!ram_state) {
 388         return;
 389     }
 390
 391     ram_state->fpo_enabled = true;
 392 }
 393
 394 uint64_t ram_bytes_remaining(void)
 395 {
 396     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 397                        0;
 398 }
 399
/* RAM migration counters (non-static, so visible outside this file) */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 412
/* Compression counters (non-static, so visible outside this file) */
CompressionStats compression_counters;

/* Per-thread state of one compression worker (see do_data_compress()). */
struct CompressParam {
    /* set to true by the worker, under comp_done_lock, after each page */
    bool done;
    /* tells the worker to leave its loop */
    bool quit;
    /* result of do_compress_ram_page() for the last page */
    bool zero_page;
    /* file handed to do_compress_ram_page() for the output */
    QEMUFile *file;
    /* mutex/cond pair the worker sleeps on while waiting for work */
    QemuMutex mutex;
    QemuCond cond;
    /* pending request: non-NULL block means there is a page to compress */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 430
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the users of this struct are outside this excerpt; the
 * field notes below are inferred from the parallel CompressParam - confirm.
 */
struct DecompressParam {
    /* presumably: worker has finished its current job */
    bool done;
    /* presumably: worker should exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably: destination for the decompressed data */
    void *des;
    /* presumably: compressed input buffer and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 442
/* Arrays with one entry per compression thread */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the variables above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration; called from do_data_compress() below. */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 462
 463 static void *do_data_compress(void *opaque)
 464 {
 465     CompressParam *param = opaque;
 466     RAMBlock *block;
 467     ram_addr_t offset;
 468     bool zero_page;
 469
 470     qemu_mutex_lock(&param->mutex);
 471     while (!param->quit) {
 472         if (param->block) {
 473             block = param->block;
 474             offset = param->offset;
 475             param->block = NULL;
 476             qemu_mutex_unlock(&param->mutex);
 477
 478             zero_page = do_compress_ram_page(param->file, &param->stream,
 479                                              block, offset, param->originbuf);
 480
 481             qemu_mutex_lock(&comp_done_lock);
 482             param->done = true;
 483             param->zero_page = zero_page;
 484             qemu_cond_signal(&comp_done_cond);
 485             qemu_mutex_unlock(&comp_done_lock);
 486
 487             qemu_mutex_lock(&param->mutex);
 488         } else {
 489             qemu_cond_wait(&param->cond, &param->mutex);
 490         }
 491     }
 492     qemu_mutex_unlock(&param->mutex);
 493
 494     return NULL;
 495 }
 496
 497 static void compress_threads_save_cleanup(void)
 498 {
 499     int i, thread_count;
 500
 501     if (!migrate_use_compression() || !comp_param) {
 502         return;
 503     }
 504
 505     thread_count = migrate_compress_threads();
 506     for (i = 0; i < thread_count; i++) {
 507         /*
 508          * we use it as a indicator which shows if the thread is
 509          * properly init'd or not
 510          */
 511         if (!comp_param[i].file) {
 512             break;
 513         }
 514
 515         qemu_mutex_lock(&comp_param[i].mutex);
 516         comp_param[i].quit = true;
 517         qemu_cond_signal(&comp_param[i].cond);
 518         qemu_mutex_unlock(&comp_param[i].mutex);
 519
 520         qemu_thread_join(compress_threads + i);
 521         qemu_mutex_destroy(&comp_param[i].mutex);
 522         qemu_cond_destroy(&comp_param[i].cond);
 523         deflateEnd(&comp_param[i].stream);
 524         g_free(comp_param[i].originbuf);
 525         qemu_fclose(comp_param[i].file);
 526         comp_param[i].file = NULL;
 527     }
 528     qemu_mutex_destroy(&comp_done_lock);
 529     qemu_cond_destroy(&comp_done_cond);
 530     g_free(compress_threads);
 531     g_free(comp_param);
 532     compress_threads = NULL;
 533     comp_param = NULL;
 534 }
 535
 536 static int compress_threads_save_setup(void)
 537 {
 538     int i, thread_count;
 539
 540     if (!migrate_use_compression()) {
 541         return 0;
 542     }
 543     thread_count = migrate_compress_threads();
 544     compress_threads = g_new0(QemuThread, thread_count);
 545     comp_param = g_new0(CompressParam, thread_count);
 546     qemu_cond_init(&comp_done_cond);
 547     qemu_mutex_init(&comp_done_lock);
 548     for (i = 0; i < thread_count; i++) {
 549         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 550         if (!comp_param[i].originbuf) {
 551             goto exit;
 552         }
 553
 554         if (deflateInit(&comp_param[i].stream,
 555                         migrate_compress_level()) != Z_OK) {
 556             g_free(comp_param[i].originbuf);
 557             goto exit;
 558         }
 559
 560         /* comp_param[i].file is just used as a dummy buffer to save data,
 561          * set its ops to empty.
 562          */
 563         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 564         comp_param[i].done = true;
 565         comp_param[i].quit = false;
 566         qemu_mutex_init(&comp_param[i].mutex);
 567         qemu_cond_init(&comp_param[i].cond);
 568         qemu_thread_create(compress_threads + i, "compress",
 569                            do_data_compress, comp_param + i,
 570                            QEMU_THREAD_JOINABLE);
 571     }
 572     return 0;
 573
 574 exit:
 575     compress_threads_save_cleanup();
 576     return -1;
 577 }
 578
/* Multiple fd's */

/* Magic and version carried in every multifd init message and packet */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Packet flag bit 0 */
#define MULTIFD_FLAG_SYNC (1 << 0)

/* This value needs to be a multiple of qemu_target_page_size() */
#define MULTIFD_PACKET_SIZE (512 * 1024)
 588
/*
 * Initial message exchanged on a multifd channel (see
 * multifd_send_initial_packet() / multifd_recv_initial_packet()).
 * magic and version are converted to big endian by the sender.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    uint8_t id;             /* channel number of the sender */
    uint8_t unused1[7];     /* Reserved for future use */
    uint64_t unused2[4];    /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;
 597
/*
 * Header of a multifd packet.  The integer fields are converted to big
 * endian by multifd_send_fill_packet() and back by
 * multifd_recv_unfill_packet().
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    /* maximum number of allocated pages */
    uint32_t pages_alloc;
    /* number of entries of offset[] that are in use */
    uint32_t pages_used;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    uint64_t packet_num;
    uint64_t unused[4];    /* Reserved for future use */
    /* idstr of the RAMBlock that the offsets refer to */
    char ramblock[256];
    /* offset of each page within that block */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 612
/* A batch of pages, all from one RAMBlock, handled as a unit by multifd */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* block that all pages of the batch belong to; NULL when unset */
    RAMBlock *block;
} MultiFDPages_t;
 626
/* State of one multifd send channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 667
/* State of one multifd receive channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* packets received through this channel */
    uint64_t num_packets;
    /* pages received through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 704
 705 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 706 {
 707     MultiFDInit_t msg;
 708     int ret;
 709
 710     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 711     msg.version = cpu_to_be32(MULTIFD_VERSION);
 712     msg.id = p->id;
 713     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 714
 715     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 716     if (ret != 0) {
 717         return -1;
 718     }
 719     return 0;
 720 }
 721
 722 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 723 {
 724     MultiFDInit_t msg;
 725     int ret;
 726
 727     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 728     if (ret != 0) {
 729         return -1;
 730     }
 731
 732     msg.magic = be32_to_cpu(msg.magic);
 733     msg.version = be32_to_cpu(msg.version);
 734
 735     if (msg.magic != MULTIFD_MAGIC) {
 736         error_setg(errp, "multifd: received packet magic %x "
 737                    "expected %x", msg.magic, MULTIFD_MAGIC);
 738         return -1;
 739     }
 740
 741     if (msg.version != MULTIFD_VERSION) {
 742         error_setg(errp, "multifd: received packet version %d "
 743                    "expected %d", msg.version, MULTIFD_VERSION);
 744         return -1;
 745     }
 746
 747     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 748         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 749         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 750
 751         error_setg(errp, "multifd: received uuid '%s' and expected "
 752                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 753         g_free(uuid);
 754         g_free(msg_uuid);
 755         return -1;
 756     }
 757
 758     if (msg.id > migrate_multifd_channels()) {
 759         error_setg(errp, "multifd: received channel version %d "
 760                    "expected %d", msg.version, MULTIFD_VERSION);
 761         return -1;
 762     }
 763
 764     return msg.id;
 765 }
 766
 767 static MultiFDPages_t *multifd_pages_init(size_t size)
 768 {
 769     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 770
 771     pages->allocated = size;
 772     pages->iov = g_new0(struct iovec, size);
 773     pages->offset = g_new0(ram_addr_t, size);
 774
 775     return pages;
 776 }
 777
 778 static void multifd_pages_clear(MultiFDPages_t *pages)
 779 {
 780     pages->used = 0;
 781     pages->allocated = 0;
 782     pages->packet_num = 0;
 783     pages->block = NULL;
 784     g_free(pages->iov);
 785     pages->iov = NULL;
 786     g_free(pages->offset);
 787     pages->offset = NULL;
 788     g_free(pages);
 789 }
 790
 791 static void multifd_send_fill_packet(MultiFDSendParams *p)
 792 {
 793     MultiFDPacket_t *packet = p->packet;
 794     uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 795     int i;
 796
 797     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 798     packet->version = cpu_to_be32(MULTIFD_VERSION);
 799     packet->flags = cpu_to_be32(p->flags);
 800     packet->pages_alloc = cpu_to_be32(page_max);
 801     packet->pages_used = cpu_to_be32(p->pages->used);
 802     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 803     packet->packet_num = cpu_to_be64(p->packet_num);
 804
 805     if (p->pages->block) {
 806         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 807     }
 808
 809     for (i = 0; i < p->pages->used; i++) {
 810         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 811     }
 812 }
 813
 814 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 815 {
 816     MultiFDPacket_t *packet = p->packet;
 817     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 818     RAMBlock *block;
 819     int i;
 820
 821     packet->magic = be32_to_cpu(packet->magic);
 822     if (packet->magic != MULTIFD_MAGIC) {
 823         error_setg(errp, "multifd: received packet "
 824                    "magic %x and expected magic %x",
 825                    packet->magic, MULTIFD_MAGIC);
 826         return -1;
 827     }
 828
 829     packet->version = be32_to_cpu(packet->version);
 830     if (packet->version != MULTIFD_VERSION) {
 831         error_setg(errp, "multifd: received packet "
 832                    "version %d and expected version %d",
 833                    packet->version, MULTIFD_VERSION);
 834         return -1;
 835     }
 836
 837     p->flags = be32_to_cpu(packet->flags);
 838
 839     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
 840     /*
 841      * If we recevied a packet that is 100 times bigger than expected
 842      * just stop migration.  It is a magic number.
 843      */
 844     if (packet->pages_alloc > pages_max * 100) {
 845         error_setg(errp, "multifd: received packet "
 846                    "with size %d and expected a maximum size of %d",
 847                    packet->pages_alloc, pages_max * 100) ;
 848         return -1;
 849     }
 850     /*
 851      * We received a packet that is bigger than expected but inside
 852      * reasonable limits (see previous comment).  Just reallocate.
 853      */
 854     if (packet->pages_alloc > p->pages->allocated) {
 855         multifd_pages_clear(p->pages);
 856         p->pages = multifd_pages_init(packet->pages_alloc);
 857     }
 858
 859     p->pages->used = be32_to_cpu(packet->pages_used);
 860     if (p->pages->used > packet->pages_alloc) {
 861         error_setg(errp, "multifd: received packet "
 862                    "with %d pages and expected maximum pages are %d",
 863                    p->pages->used, packet->pages_alloc) ;
 864         return -1;
 865     }
 866
 867     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
 868     p->packet_num = be64_to_cpu(packet->packet_num);
 869
 870     if (p->pages->used) {
 871         /* make sure that ramblock is 0 terminated */
 872         packet->ramblock[255] = 0;
 873         block = qemu_ram_block_by_name(packet->ramblock);
 874         if (!block) {
 875             error_setg(errp, "multifd: unknown ram block %s",
 876                        packet->ramblock);
 877             return -1;
 878         }
 879     }
 880
 881     for (i = 0; i < p->pages->used; i++) {
 882         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 883
 884         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 885             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 886                        " (max " RAM_ADDR_FMT ")",
 887                        offset, block->max_length);
 888             return -1;
 889         }
 890         p->pages->iov[i].iov_base = block->host + offset;
 891         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 892     }
 893
 894     return 0;
 895 }
 896
/* Global state of the multifd send side */
struct {
    /* per-channel send state, indexed by channel number */
    MultiFDSendParams *params;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 906
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages struct for each channel, and a main one.  Each
 * time that we need to send a batch of pages we swap the one in
 * multifd_send_state with the one of the channel that is sending it.
 * There are two reasons for that:
 *    - to avoid doing so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread holds the channel mutex when changing it, and the channel
 * must have finished with its own, otherwise pending_job can't be
 * false.
 */
 924
 925 static int multifd_send_pages(RAMState *rs)
 926 {
 927     int i;
 928     static int next_channel;
 929     MultiFDSendParams *p = NULL; /* make happy gcc */
 930     MultiFDPages_t *pages = multifd_send_state->pages;
 931     uint64_t transferred;
 932
 933     qemu_sem_wait(&multifd_send_state->channels_ready);
 934     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 935         p = &multifd_send_state->params[i];
 936
 937         qemu_mutex_lock(&p->mutex);
 938         if (p->quit) {
 939             error_report("%s: channel %d has already quit!", __func__, i);
 940             qemu_mutex_unlock(&p->mutex);
 941             return -1;
 942         }
 943         if (!p->pending_job) {
 944             p->pending_job++;
 945             next_channel = (i + 1) % migrate_multifd_channels();
 946             break;
 947         }
 948         qemu_mutex_unlock(&p->mutex);
 949     }
 950     p->pages->used = 0;
 951
 952     p->packet_num = multifd_send_state->packet_num++;
 953     p->pages->block = NULL;
 954     multifd_send_state->pages = p->pages;
 955     p->pages = pages;
 956     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 957     qemu_file_update_transfer(rs->f, transferred);
 958     ram_counters.multifd_bytes += transferred;
 959     ram_counters.transferred += transferred;;
 960     qemu_mutex_unlock(&p->mutex);
 961     qemu_sem_post(&p->sem);
 962
 963     return 1;
 964 }
 965
 966 static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 967 {
 968     MultiFDPages_t *pages = multifd_send_state->pages;
 969
 970     if (!pages->block) {
 971         pages->block = block;
 972     }
 973
 974     if (pages->block == block) {
 975         pages->offset[pages->used] = offset;
 976         pages->iov[pages->used].iov_base = block->host + offset;
 977         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 978         pages->used++;
 979
 980         if (pages->used < pages->allocated) {
 981             return 1;
 982         }
 983     }
 984
 985     if (multifd_send_pages(rs) < 0) {
 986         return -1;
 987     }
 988
 989     if (pages->block != block) {
 990         return  multifd_queue_page(rs, block, offset);
 991     }
 992
 993     return 1;
 994 }
 995
 996 static void multifd_send_terminate_threads(Error *err)
 997 {
 998     int i;
 999
1000     trace_multifd_send_terminate_threads(err != NULL);
1001
1002     if (err) {
1003         MigrationState *s = migrate_get_current();
1004         migrate_set_error(s, err);
1005         if (s->state == MIGRATION_STATUS_SETUP ||
1006             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1007             s->state == MIGRATION_STATUS_DEVICE ||
1008             s->state == MIGRATION_STATUS_ACTIVE) {
1009             migrate_set_state(&s->state, s->state,
1010                               MIGRATION_STATUS_FAILED);
1011         }
1012     }
1013
1014     for (i = 0; i < migrate_multifd_channels(); i++) {
1015         MultiFDSendParams *p = &multifd_send_state->params[i];
1016
1017         qemu_mutex_lock(&p->mutex);
1018         p->quit = true;
1019         qemu_sem_post(&p->sem);
1020         qemu_mutex_unlock(&p->mutex);
1021     }
1022 }
1023
1024 void multifd_save_cleanup(void)
1025 {
1026     int i;
1027
1028     if (!migrate_use_multifd()) {
1029         return;
1030     }
1031     multifd_send_terminate_threads(NULL);
1032     for (i = 0; i < migrate_multifd_channels(); i++) {
1033         MultiFDSendParams *p = &multifd_send_state->params[i];
1034
1035         if (p->running) {
1036             qemu_thread_join(&p->thread);
1037         }
1038         socket_send_channel_destroy(p->c);
1039         p->c = NULL;
1040         qemu_mutex_destroy(&p->mutex);
1041         qemu_sem_destroy(&p->sem);
1042         qemu_sem_destroy(&p->sem_sync);
1043         g_free(p->name);
1044         p->name = NULL;
1045         multifd_pages_clear(p->pages);
1046         p->pages = NULL;
1047         p->packet_len = 0;
1048         g_free(p->packet);
1049         p->packet = NULL;
1050     }
1051     qemu_sem_destroy(&multifd_send_state->channels_ready);
1052     g_free(multifd_send_state->params);
1053     multifd_send_state->params = NULL;
1054     multifd_pages_clear(multifd_send_state->pages);
1055     multifd_send_state->pages = NULL;
1056     g_free(multifd_send_state);
1057     multifd_send_state = NULL;
1058 }
1059
1060 static void multifd_send_sync_main(RAMState *rs)
1061 {
1062     int i;
1063
1064     if (!migrate_use_multifd()) {
1065         return;
1066     }
1067     if (multifd_send_state->pages->used) {
1068         if (multifd_send_pages(rs) < 0) {
1069             error_report("%s: multifd_send_pages fail", __func__);
1070             return;
1071         }
1072     }
1073     for (i = 0; i < migrate_multifd_channels(); i++) {
1074         MultiFDSendParams *p = &multifd_send_state->params[i];
1075
1076         trace_multifd_send_sync_main_signal(p->id);
1077
1078         qemu_mutex_lock(&p->mutex);
1079
1080         if (p->quit) {
1081             error_report("%s: channel %d has already quit", __func__, i);
1082             qemu_mutex_unlock(&p->mutex);
1083             return;
1084         }
1085
1086         p->packet_num = multifd_send_state->packet_num++;
1087         p->flags |= MULTIFD_FLAG_SYNC;
1088         p->pending_job++;
1089         qemu_file_update_transfer(rs->f, p->packet_len);
1090         ram_counters.multifd_bytes += p->packet_len;
1091         ram_counters.transferred += p->packet_len;
1092         qemu_mutex_unlock(&p->mutex);
1093         qemu_sem_post(&p->sem);
1094     }
1095     for (i = 0; i < migrate_multifd_channels(); i++) {
1096         MultiFDSendParams *p = &multifd_send_state->params[i];
1097
1098         trace_multifd_send_sync_main_wait(p->id);
1099         qemu_sem_wait(&p->sem_sync);
1100     }
1101     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1102 }
1103
1104 static void *multifd_send_thread(void *opaque)
1105 {
1106     MultiFDSendParams *p = opaque;
1107     Error *local_err = NULL;
1108     int ret = 0;
1109     uint32_t flags = 0;
1110
1111     trace_multifd_send_thread_start(p->id);
1112     rcu_register_thread();
1113
1114     if (multifd_send_initial_packet(p, &local_err) < 0) {
1115         goto out;
1116     }
1117     /* initial packet */
1118     p->num_packets = 1;
1119
1120     while (true) {
1121         qemu_sem_wait(&p->sem);
1122         qemu_mutex_lock(&p->mutex);
1123
1124         if (p->pending_job) {
1125             uint32_t used = p->pages->used;
1126             uint64_t packet_num = p->packet_num;
1127             flags = p->flags;
1128
1129             p->next_packet_size = used * qemu_target_page_size();
1130             multifd_send_fill_packet(p);
1131             p->flags = 0;
1132             p->num_packets++;
1133             p->num_pages += used;
1134             p->pages->used = 0;
1135             qemu_mutex_unlock(&p->mutex);
1136
1137             trace_multifd_send(p->id, packet_num, used, flags,
1138                                p->next_packet_size);
1139
1140             ret = qio_channel_write_all(p->c, (void *)p->packet,
1141                                         p->packet_len, &local_err);
1142             if (ret != 0) {
1143                 break;
1144             }
1145
1146             if (used) {
1147                 ret = qio_channel_writev_all(p->c, p->pages->iov,
1148                                              used, &local_err);
1149                 if (ret != 0) {
1150                     break;
1151                 }
1152             }
1153
1154             qemu_mutex_lock(&p->mutex);
1155             p->pending_job--;
1156             qemu_mutex_unlock(&p->mutex);
1157
1158             if (flags & MULTIFD_FLAG_SYNC) {
1159                 qemu_sem_post(&p->sem_sync);
1160             }
1161             qemu_sem_post(&multifd_send_state->channels_ready);
1162         } else if (p->quit) {
1163             qemu_mutex_unlock(&p->mutex);
1164             break;
1165         } else {
1166             qemu_mutex_unlock(&p->mutex);
1167             /* sometimes there are spurious wakeups */
1168         }
1169     }
1170
1171 out:
1172     if (local_err) {
1173         multifd_send_terminate_threads(local_err);
1174     }
1175
1176     /*
1177      * Error happen, I will exit, but I can't just leave, tell
1178      * who pay attention to me.
1179      */
1180     if (ret != 0) {
1181         if (flags & MULTIFD_FLAG_SYNC) {
1182             qemu_sem_post(&p->sem_sync);
1183         }
1184         qemu_sem_post(&multifd_send_state->channels_ready);
1185     }
1186
1187     qemu_mutex_lock(&p->mutex);
1188     p->running = false;
1189     qemu_mutex_unlock(&p->mutex);
1190
1191     rcu_unregister_thread();
1192     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1193
1194     return NULL;
1195 }
1196
1197 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1198 {
1199     MultiFDSendParams *p = opaque;
1200     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1201     Error *local_err = NULL;
1202
1203     if (qio_task_propagate_error(task, &local_err)) {
1204         migrate_set_error(migrate_get_current(), local_err);
1205         multifd_save_cleanup();
1206     } else {
1207         p->c = QIO_CHANNEL(sioc);
1208         qio_channel_set_delay(p->c, false);
1209         p->running = true;
1210         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1211                            QEMU_THREAD_JOINABLE);
1212     }
1213 }
1214
1215 int multifd_save_setup(void)
1216 {
1217     int thread_count;
1218     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1219     uint8_t i;
1220
1221     if (!migrate_use_multifd()) {
1222         return 0;
1223     }
1224     thread_count = migrate_multifd_channels();
1225     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1226     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1227     multifd_send_state->pages = multifd_pages_init(page_count);
1228     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1229
1230     for (i = 0; i < thread_count; i++) {
1231         MultiFDSendParams *p = &multifd_send_state->params[i];
1232
1233         qemu_mutex_init(&p->mutex);
1234         qemu_sem_init(&p->sem, 0);
1235         qemu_sem_init(&p->sem_sync, 0);
1236         p->quit = false;
1237         p->pending_job = 0;
1238         p->id = i;
1239         p->pages = multifd_pages_init(page_count);
1240         p->packet_len = sizeof(MultiFDPacket_t)
1241                       + sizeof(ram_addr_t) * page_count;
1242         p->packet = g_malloc0(p->packet_len);
1243         p->name = g_strdup_printf("multifdsend_%d", i);
1244         socket_send_channel_create(multifd_new_send_channel_async, p);
1245     }
1246     return 0;
1247 }
1248
/*
 * Global state of the multifd receiving side.  It is allocated by
 * multifd_load_setup() and released by multifd_load_cleanup().
 */
struct {
    /* per-channel parameters, one entry per multifd channel */
    MultiFDRecvParams *params;
    /* number of created threads; accessed with atomic operations */
    int count;
    /*
     * syncs main thread and channels: posted by a channel thread when
     * it receives a packet flagged MULTIFD_FLAG_SYNC
     */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1258
1259 static void multifd_recv_terminate_threads(Error *err)
1260 {
1261     int i;
1262
1263     trace_multifd_recv_terminate_threads(err != NULL);
1264
1265     if (err) {
1266         MigrationState *s = migrate_get_current();
1267         migrate_set_error(s, err);
1268         if (s->state == MIGRATION_STATUS_SETUP ||
1269             s->state == MIGRATION_STATUS_ACTIVE) {
1270             migrate_set_state(&s->state, s->state,
1271                               MIGRATION_STATUS_FAILED);
1272         }
1273     }
1274
1275     for (i = 0; i < migrate_multifd_channels(); i++) {
1276         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1277
1278         qemu_mutex_lock(&p->mutex);
1279         p->quit = true;
1280         /* We could arrive here for two reasons:
1281            - normal quit, i.e. everything went fine, just finished
1282            - error quit: We close the channels so the channel threads
1283              finish the qio_channel_read_all_eof() */
1284         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1285         qemu_mutex_unlock(&p->mutex);
1286     }
1287 }
1288
1289 int multifd_load_cleanup(Error **errp)
1290 {
1291     int i;
1292     int ret = 0;
1293
1294     if (!migrate_use_multifd()) {
1295         return 0;
1296     }
1297     multifd_recv_terminate_threads(NULL);
1298     for (i = 0; i < migrate_multifd_channels(); i++) {
1299         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1300
1301         if (p->running) {
1302             p->quit = true;
1303             /*
1304              * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
1305              * however try to wakeup it without harm in cleanup phase.
1306              */
1307             qemu_sem_post(&p->sem_sync);
1308             qemu_thread_join(&p->thread);
1309         }
1310         object_unref(OBJECT(p->c));
1311         p->c = NULL;
1312         qemu_mutex_destroy(&p->mutex);
1313         qemu_sem_destroy(&p->sem_sync);
1314         g_free(p->name);
1315         p->name = NULL;
1316         multifd_pages_clear(p->pages);
1317         p->pages = NULL;
1318         p->packet_len = 0;
1319         g_free(p->packet);
1320         p->packet = NULL;
1321     }
1322     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1323     g_free(multifd_recv_state->params);
1324     multifd_recv_state->params = NULL;
1325     g_free(multifd_recv_state);
1326     multifd_recv_state = NULL;
1327
1328     return ret;
1329 }
1330
1331 static void multifd_recv_sync_main(void)
1332 {
1333     int i;
1334
1335     if (!migrate_use_multifd()) {
1336         return;
1337     }
1338     for (i = 0; i < migrate_multifd_channels(); i++) {
1339         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1340
1341         trace_multifd_recv_sync_main_wait(p->id);
1342         qemu_sem_wait(&multifd_recv_state->sem_sync);
1343     }
1344     for (i = 0; i < migrate_multifd_channels(); i++) {
1345         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1346
1347         qemu_mutex_lock(&p->mutex);
1348         if (multifd_recv_state->packet_num < p->packet_num) {
1349             multifd_recv_state->packet_num = p->packet_num;
1350         }
1351         qemu_mutex_unlock(&p->mutex);
1352         trace_multifd_recv_sync_main_signal(p->id);
1353         qemu_sem_post(&p->sem_sync);
1354     }
1355     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1356 }
1357
1358 static void *multifd_recv_thread(void *opaque)
1359 {
1360     MultiFDRecvParams *p = opaque;
1361     Error *local_err = NULL;
1362     int ret;
1363
1364     trace_multifd_recv_thread_start(p->id);
1365     rcu_register_thread();
1366
1367     while (true) {
1368         uint32_t used;
1369         uint32_t flags;
1370
1371         if (p->quit) {
1372             break;
1373         }
1374
1375         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1376                                        p->packet_len, &local_err);
1377         if (ret == 0) {   /* EOF */
1378             break;
1379         }
1380         if (ret == -1) {   /* Error */
1381             break;
1382         }
1383
1384         qemu_mutex_lock(&p->mutex);
1385         ret = multifd_recv_unfill_packet(p, &local_err);
1386         if (ret) {
1387             qemu_mutex_unlock(&p->mutex);
1388             break;
1389         }
1390
1391         used = p->pages->used;
1392         flags = p->flags;
1393         trace_multifd_recv(p->id, p->packet_num, used, flags,
1394                            p->next_packet_size);
1395         p->num_packets++;
1396         p->num_pages += used;
1397         qemu_mutex_unlock(&p->mutex);
1398
1399         if (used) {
1400             ret = qio_channel_readv_all(p->c, p->pages->iov,
1401                                         used, &local_err);
1402             if (ret != 0) {
1403                 break;
1404             }
1405         }
1406
1407         if (flags & MULTIFD_FLAG_SYNC) {
1408             qemu_sem_post(&multifd_recv_state->sem_sync);
1409             qemu_sem_wait(&p->sem_sync);
1410         }
1411     }
1412
1413     if (local_err) {
1414         multifd_recv_terminate_threads(local_err);
1415     }
1416     qemu_mutex_lock(&p->mutex);
1417     p->running = false;
1418     qemu_mutex_unlock(&p->mutex);
1419
1420     rcu_unregister_thread();
1421     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1422
1423     return NULL;
1424 }
1425
1426 int multifd_load_setup(void)
1427 {
1428     int thread_count;
1429     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1430     uint8_t i;
1431
1432     if (!migrate_use_multifd()) {
1433         return 0;
1434     }
1435     thread_count = migrate_multifd_channels();
1436     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1437     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1438     atomic_set(&multifd_recv_state->count, 0);
1439     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1440
1441     for (i = 0; i < thread_count; i++) {
1442         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1443
1444         qemu_mutex_init(&p->mutex);
1445         qemu_sem_init(&p->sem_sync, 0);
1446         p->quit = false;
1447         p->id = i;
1448         p->pages = multifd_pages_init(page_count);
1449         p->packet_len = sizeof(MultiFDPacket_t)
1450                       + sizeof(ram_addr_t) * page_count;
1451         p->packet = g_malloc0(p->packet_len);
1452         p->name = g_strdup_printf("multifdrecv_%d", i);
1453     }
1454     return 0;
1455 }
1456
1457 bool multifd_recv_all_channels_created(void)
1458 {
1459     int thread_count = migrate_multifd_channels();
1460
1461     if (!migrate_use_multifd()) {
1462         return true;
1463     }
1464
1465     return thread_count == atomic_read(&multifd_recv_state->count);
1466 }
1467
1468 /*
1469  * Try to receive all multifd channels to get ready for the migration.
1470  * - Return true and do not set @errp when correctly receving all channels;
1471  * - Return false and do not set @errp when correctly receiving the current one;
1472  * - Return false and set @errp when failing to receive the current channel.
1473  */
1474 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1475 {
1476     MultiFDRecvParams *p;
1477     Error *local_err = NULL;
1478     int id;
1479
1480     id = multifd_recv_initial_packet(ioc, &local_err);
1481     if (id < 0) {
1482         multifd_recv_terminate_threads(local_err);
1483         error_propagate_prepend(errp, local_err,
1484                                 "failed to receive packet"
1485                                 " via multifd channel %d: ",
1486                                 atomic_read(&multifd_recv_state->count));
1487         return false;
1488     }
1489
1490     p = &multifd_recv_state->params[id];
1491     if (p->c != NULL) {
1492         error_setg(&local_err, "multifd: received id '%d' already setup'",
1493                    id);
1494         multifd_recv_terminate_threads(local_err);
1495         error_propagate(errp, local_err);
1496         return false;
1497     }
1498     p->c = ioc;
1499     object_ref(OBJECT(ioc));
1500     /* initial packet */
1501     p->num_packets = 1;
1502
1503     p->running = true;
1504     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1505                        QEMU_THREAD_JOINABLE);
1506     atomic_inc(&multifd_recv_state->count);
1507     return atomic_read(&multifd_recv_state->count) ==
1508            migrate_multifd_channels();
1509 }
1510
1511 /**
1512  * save_page_header: write page header to wire
1513  *
1514  * If this is the 1st block, it also writes the block identification
1515  *
1516  * Returns the number of bytes written
1517  *
1518  * @f: QEMUFile where to send the data
1519  * @block: block that contains the page we want to send
1520  * @offset: offset inside the block for the page
1521  *          in the lower bits, it contains flags
1522  */
1523 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1524                                ram_addr_t offset)
1525 {
1526     size_t size, len;
1527
1528     if (block == rs->last_sent_block) {
1529         offset |= RAM_SAVE_FLAG_CONTINUE;
1530     }
1531     qemu_put_be64(f, offset);
1532     size = 8;
1533
1534     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1535         len = strlen(block->idstr);
1536         qemu_put_byte(f, len);
1537         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1538         size += 1 + len;
1539         rs->last_sent_block = block;
1540     }
1541     return size;
1542 }
1543
1544 /**
1545  * mig_throttle_guest_down: throotle down the guest
1546  *
1547  * Reduce amount of guest cpu execution to hopefully slow down memory
1548  * writes. If guest dirty memory rate is reduced below the rate at
1549  * which we can transfer pages to the destination then we should be
1550  * able to complete migration. Some workloads dirty memory way too
1551  * fast and will not effectively converge, even with auto-converge.
1552  */
1553 static void mig_throttle_guest_down(void)
1554 {
1555     MigrationState *s = migrate_get_current();
1556     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1557     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1558     int pct_max = s->parameters.max_cpu_throttle;
1559
1560     /* We have not started throttling yet. Let's start it. */
1561     if (!cpu_throttle_active()) {
1562         cpu_throttle_set(pct_initial);
1563     } else {
1564         /* Throttling already on, just increase the rate */
1565         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1566                          pct_max));
1567     }
1568 }
1569
1570 /**
1571  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1572  *
1573  * @rs: current RAM state
1574  * @current_addr: address for the zero page
1575  *
1576  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1577  * The important thing is that a stale (not-yet-0'd) page be replaced
1578  * by the new data.
1579  * As a bonus, if the page wasn't in the cache it gets added so that
1580  * when a small write is made into the 0'd page it gets XBZRLE sent.
1581  */
1582 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1583 {
1584     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1585         return;
1586     }
1587
1588     /* We don't care if this fails to allocate a new cache page
1589      * as long as it updated an old one */
1590     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1591                  ram_counters.dirty_sync_count);
1592 }
1593
1594 #define ENCODING_FLAG_XBZRLE 0x1
1595
1596 /**
1597  * save_xbzrle_page: compress and send current page
1598  *
1599  * Returns: 1 means that we wrote the page
1600  *          0 means that page is identical to the one already sent
1601  *          -1 means that xbzrle would be longer than normal
1602  *
1603  * @rs: current RAM state
1604  * @current_data: pointer to the address of the page contents
1605  * @current_addr: addr of the page
1606  * @block: block that contains the page we want to send
1607  * @offset: offset inside the block for the page
1608  * @last_stage: if we are at the completion stage
1609  */
1610 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1611                             ram_addr_t current_addr, RAMBlock *block,
1612                             ram_addr_t offset, bool last_stage)
1613 {
1614     int encoded_len = 0, bytes_xbzrle;
1615     uint8_t *prev_cached_page;
1616
1617     if (!cache_is_cached(XBZRLE.cache, current_addr,
1618                          ram_counters.dirty_sync_count)) {
1619         xbzrle_counters.cache_miss++;
1620         if (!last_stage) {
1621             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1622                              ram_counters.dirty_sync_count) == -1) {
1623                 return -1;
1624             } else {
1625                 /* update *current_data when the page has been
1626                    inserted into cache */
1627                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1628             }
1629         }
1630         return -1;
1631     }
1632
1633     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1634
1635     /* save current buffer into memory */
1636     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1637
1638     /* XBZRLE encoding (if there is no overflow) */
1639     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1640                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1641                                        TARGET_PAGE_SIZE);
1642
1643     /*
1644      * Update the cache contents, so that it corresponds to the data
1645      * sent, in all cases except where we skip the page.
1646      */
1647     if (!last_stage && encoded_len != 0) {
1648         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1649         /*
1650          * In the case where we couldn't compress, ensure that the caller
1651          * sends the data from the cache, since the guest might have
1652          * changed the RAM since we copied it.
1653          */
1654         *current_data = prev_cached_page;
1655     }
1656
1657     if (encoded_len == 0) {
1658         trace_save_xbzrle_page_skipping();
1659         return 0;
1660     } else if (encoded_len == -1) {
1661         trace_save_xbzrle_page_overflow();
1662         xbzrle_counters.overflow++;
1663         return -1;
1664     }
1665
1666     /* Send XBZRLE based compressed page */
1667     bytes_xbzrle = save_page_header(rs, rs->f, block,
1668                                     offset | RAM_SAVE_FLAG_XBZRLE);
1669     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1670     qemu_put_be16(rs->f, encoded_len);
1671     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1672     bytes_xbzrle += encoded_len + 1 + 2;
1673     xbzrle_counters.pages++;
1674     xbzrle_counters.bytes += bytes_xbzrle;
1675     ram_counters.transferred += bytes_xbzrle;
1676
1677     return 1;
1678 }
1679
1680 /**
1681  * migration_bitmap_find_dirty: find the next dirty page from start
1682  *
1683  * Returns the page offset within memory region of the start of a dirty page
1684  *
1685  * @rs: current RAM state
1686  * @rb: RAMBlock where to search for dirty pages
1687  * @start: page where we start the search
1688  */
1689 static inline
1690 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1691                                           unsigned long start)
1692 {
1693     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1694     unsigned long *bitmap = rb->bmap;
1695     unsigned long next;
1696
1697     if (ramblock_is_ignored(rb)) {
1698         return size;
1699     }
1700
1701     /*
1702      * When the free page optimization is enabled, we need to check the bitmap
1703      * to send the non-free pages rather than all the pages in the bulk stage.
1704      */
1705     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1706         next = start + 1;
1707     } else {
1708         next = find_next_bit(bitmap, size, start);
1709     }
1710
1711     return next;
1712 }
1713
1714 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1715                                                 RAMBlock *rb,
1716                                                 unsigned long page)
1717 {
1718     bool ret;
1719
1720     qemu_mutex_lock(&rs->bitmap_mutex);
1721
1722     /*
1723      * Clear dirty bitmap if needed.  This _must_ be called before we
1724      * send any of the page in the chunk because we need to make sure
1725      * we can capture further page content changes when we sync dirty
1726      * log the next time.  So as long as we are going to send any of
1727      * the page in the chunk we clear the remote dirty bitmap for all.
1728      * Clearing it earlier won't be a problem, but too late will.
1729      */
1730     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1731         uint8_t shift = rb->clear_bmap_shift;
1732         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1733         hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1734
1735         /*
1736          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1737          * can make things easier sometimes since then start address
1738          * of the small chunk will always be 64 pages aligned so the
1739          * bitmap will always be aligned to unsigned long.  We should
1740          * even be able to remove this restriction but I'm simply
1741          * keeping it.
1742          */
1743         assert(shift >= 6);
1744         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1745         memory_region_clear_dirty_bitmap(rb->mr, start, size);
1746     }
1747
1748     ret = test_and_clear_bit(page, rb->bmap);
1749
1750     if (ret) {
1751         rs->migration_dirty_pages--;
1752     }
1753     qemu_mutex_unlock(&rs->bitmap_mutex);
1754
1755     return ret;
1756 }
1757
1758 /* Called with RCU critical section */
1759 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1760 {
1761     rs->migration_dirty_pages +=
1762         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
1763                                               &rs->num_dirty_pages_period);
1764 }
1765
1766 /**
1767  * ram_pagesize_summary: calculate all the pagesizes of a VM
1768  *
1769  * Returns a summary bitmap of the page sizes of all RAMBlocks
1770  *
1771  * For VMs with just normal pages this is equivalent to the host page
1772  * size. If it's got some huge pages then it's the OR of all the
1773  * different page sizes.
1774  */
1775 uint64_t ram_pagesize_summary(void)
1776 {
1777     RAMBlock *block;
1778     uint64_t summary = 0;
1779
1780     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1781         summary |= block->page_size;
1782     }
1783
1784     return summary;
1785 }
1786
1787 uint64_t ram_get_total_transferred_pages(void)
1788 {
1789     return  ram_counters.normal + ram_counters.duplicate +
1790                 compression_counters.pages + xbzrle_counters.pages;
1791 }
1792
1793 static void migration_update_rates(RAMState *rs, int64_t end_time)
1794 {
1795     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1796     double compressed_size;
1797
1798     /* calculate period counters */
1799     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1800                 / (end_time - rs->time_last_bitmap_sync);
1801
1802     if (!page_count) {
1803         return;
1804     }
1805
1806     if (migrate_use_xbzrle()) {
1807         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1808             rs->xbzrle_cache_miss_prev) / page_count;
1809         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1810     }
1811
1812     if (migrate_use_compression()) {
1813         compression_counters.busy_rate = (double)(compression_counters.busy -
1814             rs->compress_thread_busy_prev) / page_count;
1815         rs->compress_thread_busy_prev = compression_counters.busy;
1816
1817         compressed_size = compression_counters.compressed_size -
1818                           rs->compressed_size_prev;
1819         if (compressed_size) {
1820             double uncompressed_size = (compression_counters.pages -
1821                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1822
1823             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1824             compression_counters.compression_rate =
1825                                         uncompressed_size / compressed_size;
1826
1827             rs->compress_pages_prev = compression_counters.pages;
1828             rs->compressed_size_prev = compression_counters.compressed_size;
1829         }
1830     }
1831 }
1832
1833 static void migration_bitmap_sync(RAMState *rs)
1834 {
1835     RAMBlock *block;
1836     int64_t end_time;
1837     uint64_t bytes_xfer_now;
1838
1839     ram_counters.dirty_sync_count++;
1840
1841     if (!rs->time_last_bitmap_sync) {
1842         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1843     }
1844
1845     trace_migration_bitmap_sync_start();
1846     memory_global_dirty_log_sync();
1847
1848     qemu_mutex_lock(&rs->bitmap_mutex);
1849     rcu_read_lock();
1850     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1851         ramblock_sync_dirty_bitmap(rs, block);
1852     }
1853     ram_counters.remaining = ram_bytes_remaining();
1854     rcu_read_unlock();
1855     qemu_mutex_unlock(&rs->bitmap_mutex);
1856
1857     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1858
1859     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1860
1861     /* more than 1 second = 1000 millisecons */
1862     if (end_time > rs->time_last_bitmap_sync + 1000) {
1863         bytes_xfer_now = ram_counters.transferred;
1864
1865         /* During block migration the auto-converge logic incorrectly detects
1866          * that ram migration makes no progress. Avoid this by disabling the
1867          * throttling logic during the bulk phase of block migration. */
1868         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1869             /* The following detection logic can be refined later. For now:
1870                Check to see if the dirtied bytes is 50% more than the approx.
1871                amount of bytes that just got transferred since the last time we
1872                were in this routine. If that happens twice, start or increase
1873                throttling */
1874
1875             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1876                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1877                 (++rs->dirty_rate_high_cnt >= 2)) {
1878                     trace_migration_throttle();
1879                     rs->dirty_rate_high_cnt = 0;
1880                     mig_throttle_guest_down();
1881             }
1882         }
1883
1884         migration_update_rates(rs, end_time);
1885
1886         rs->target_page_count_prev = rs->target_page_count;
1887
1888         /* reset period counters */
1889         rs->time_last_bitmap_sync = end_time;
1890         rs->num_dirty_pages_period = 0;
1891         rs->bytes_xfer_prev = bytes_xfer_now;
1892     }
1893     if (migrate_use_events()) {
1894         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1895     }
1896 }
1897
1898 static void migration_bitmap_sync_precopy(RAMState *rs)
1899 {
1900     Error *local_err = NULL;
1901
1902     /*
1903      * The current notifier usage is just an optimization to migration, so we
1904      * don't stop the normal migration process in the error case.
1905      */
1906     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1907         error_report_err(local_err);
1908     }
1909
1910     migration_bitmap_sync(rs);
1911
1912     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1913         error_report_err(local_err);
1914     }
1915 }
1916
1917 /**
1918  * save_zero_page_to_file: send the zero page to the file
1919  *
1920  * Returns the size of data written to the file, 0 means the page is not
1921  * a zero page
1922  *
1923  * @rs: current RAM state
1924  * @file: the file where the data is saved
1925  * @block: block that contains the page we want to send
1926  * @offset: offset inside the block for the page
1927  */
1928 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1929                                   RAMBlock *block, ram_addr_t offset)
1930 {
1931     uint8_t *p = block->host + offset;
1932     int len = 0;
1933
1934     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1935         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1936         qemu_put_byte(file, 0);
1937         len += 1;
1938     }
1939     return len;
1940 }
1941
1942 /**
1943  * save_zero_page: send the zero page to the stream
1944  *
1945  * Returns the number of pages written.
1946  *
1947  * @rs: current RAM state
1948  * @block: block that contains the page we want to send
1949  * @offset: offset inside the block for the page
1950  */
1951 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1952 {
1953     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1954
1955     if (len) {
1956         ram_counters.duplicate++;
1957         ram_counters.transferred += len;
1958         return 1;
1959     }
1960     return -1;
1961 }
1962
1963 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1964 {
1965     if (!migrate_release_ram() || !migration_in_postcopy()) {
1966         return;
1967     }
1968
1969     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1970 }
1971
1972 /*
1973  * @pages: the number of pages written by the control path,
1974  *        < 0 - error
1975  *        > 0 - number of pages written
1976  *
1977  * Return true if the pages has been saved, otherwise false is returned.
1978  */
1979 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1980                               int *pages)
1981 {
1982     uint64_t bytes_xmit = 0;
1983     int ret;
1984
1985     *pages = -1;
1986     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1987                                 &bytes_xmit);
1988     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1989         return false;
1990     }
1991
1992     if (bytes_xmit) {
1993         ram_counters.transferred += bytes_xmit;
1994         *pages = 1;
1995     }
1996
1997     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1998         return true;
1999     }
2000
2001     if (bytes_xmit > 0) {
2002         ram_counters.normal++;
2003     } else if (bytes_xmit == 0) {
2004         ram_counters.duplicate++;
2005     }
2006
2007     return true;
2008 }
2009
2010 /*
2011  * directly send the page to the stream
2012  *
2013  * Returns the number of pages written.
2014  *
2015  * @rs: current RAM state
2016  * @block: block that contains the page we want to send
2017  * @offset: offset inside the block for the page
2018  * @buf: the page to be sent
2019  * @async: send to page asyncly
2020  */
2021 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2022                             uint8_t *buf, bool async)
2023 {
2024     ram_counters.transferred += save_page_header(rs, rs->f, block,
2025                                                  offset | RAM_SAVE_FLAG_PAGE);
2026     if (async) {
2027         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2028                               migrate_release_ram() &
2029                               migration_in_postcopy());
2030     } else {
2031         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2032     }
2033     ram_counters.transferred += TARGET_PAGE_SIZE;
2034     ram_counters.normal++;
2035     return 1;
2036 }
2037
2038 /**
2039  * ram_save_page: send the given page to the stream
2040  *
2041  * Returns the number of pages written.
2042  *          < 0 - error
2043  *          >=0 - Number of pages written - this might legally be 0
2044  *                if xbzrle noticed the page was the same.
2045  *
2046  * @rs: current RAM state
2047  * @block: block that contains the page we want to send
2048  * @offset: offset inside the block for the page
2049  * @last_stage: if we are at the completion stage
2050  */
2051 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
2052 {
2053     int pages = -1;
2054     uint8_t *p;
2055     bool send_async = true;
2056     RAMBlock *block = pss->block;
2057     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2058     ram_addr_t current_addr = block->offset + offset;
2059
2060     p = block->host + offset;
2061     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2062
2063     XBZRLE_cache_lock();
2064     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2065         migrate_use_xbzrle()) {
2066         pages = save_xbzrle_page(rs, &p, current_addr, block,
2067                                  offset, last_stage);
2068         if (!last_stage) {
2069             /* Can't send this cached data async, since the cache page
2070              * might get updated before it gets to the wire
2071              */
2072             send_async = false;
2073         }
2074     }
2075
2076     /* XBZRLE overflow or normal page */
2077     if (pages == -1) {
2078         pages = save_normal_page(rs, block, offset, p, send_async);
2079     }
2080
2081     XBZRLE_cache_unlock();
2082
2083     return pages;
2084 }
2085
2086 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2087                                  ram_addr_t offset)
2088 {
2089     if (multifd_queue_page(rs, block, offset) < 0) {
2090         return -1;
2091     }
2092     ram_counters.normal++;
2093
2094     return 1;
2095 }
2096
2097 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2098                                  ram_addr_t offset, uint8_t *source_buf)
2099 {
2100     RAMState *rs = ram_state;
2101     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2102     bool zero_page = false;
2103     int ret;
2104
2105     if (save_zero_page_to_file(rs, f, block, offset)) {
2106         zero_page = true;
2107         goto exit;
2108     }
2109
2110     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2111
2112     /*
2113      * copy it to a internal buffer to avoid it being modified by VM
2114      * so that we can catch up the error during compression and
2115      * decompression
2116      */
2117     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2118     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2119     if (ret < 0) {
2120         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2121         error_report("compressed data failed!");
2122         return false;
2123     }
2124
2125 exit:
2126     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2127     return zero_page;
2128 }
2129
2130 static void
2131 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2132 {
2133     ram_counters.transferred += bytes_xmit;
2134
2135     if (param->zero_page) {
2136         ram_counters.duplicate++;
2137         return;
2138     }
2139
2140     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2141     compression_counters.compressed_size += bytes_xmit - 8;
2142     compression_counters.pages++;
2143 }
2144
2145 static bool save_page_use_compression(RAMState *rs);
2146
2147 static void flush_compressed_data(RAMState *rs)
2148 {
2149     int idx, len, thread_count;
2150
2151     if (!save_page_use_compression(rs)) {
2152         return;
2153     }
2154     thread_count = migrate_compress_threads();
2155
2156     qemu_mutex_lock(&comp_done_lock);
2157     for (idx = 0; idx < thread_count; idx++) {
2158         while (!comp_param[idx].done) {
2159             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2160         }
2161     }
2162     qemu_mutex_unlock(&comp_done_lock);
2163
2164     for (idx = 0; idx < thread_count; idx++) {
2165         qemu_mutex_lock(&comp_param[idx].mutex);
2166         if (!comp_param[idx].quit) {
2167             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2168             /*
2169              * it's safe to fetch zero_page without holding comp_done_lock
2170              * as there is no further request submitted to the thread,
2171              * i.e, the thread should be waiting for a request at this point.
2172              */
2173             update_compress_thread_counts(&comp_param[idx], len);
2174         }
2175         qemu_mutex_unlock(&comp_param[idx].mutex);
2176     }
2177 }
2178
2179 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2180                                        ram_addr_t offset)
2181 {
2182     param->block = block;
2183     param->offset = offset;
2184 }
2185
2186 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2187                                            ram_addr_t offset)
2188 {
2189     int idx, thread_count, bytes_xmit = -1, pages = -1;
2190     bool wait = migrate_compress_wait_thread();
2191
2192     thread_count = migrate_compress_threads();
2193     qemu_mutex_lock(&comp_done_lock);
2194 retry:
2195     for (idx = 0; idx < thread_count; idx++) {
2196         if (comp_param[idx].done) {
2197             comp_param[idx].done = false;
2198             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2199             qemu_mutex_lock(&comp_param[idx].mutex);
2200             set_compress_params(&comp_param[idx], block, offset);
2201             qemu_cond_signal(&comp_param[idx].cond);
2202             qemu_mutex_unlock(&comp_param[idx].mutex);
2203             pages = 1;
2204             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2205             break;
2206         }
2207     }
2208
2209     /*
2210      * wait for the free thread if the user specifies 'compress-wait-thread',
2211      * otherwise we will post the page out in the main thread as normal page.
2212      */
2213     if (pages < 0 && wait) {
2214         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2215         goto retry;
2216     }
2217     qemu_mutex_unlock(&comp_done_lock);
2218
2219     return pages;
2220 }
2221
2222 /**
2223  * find_dirty_block: find the next dirty page and update any state
2224  * associated with the search process.
2225  *
2226  * Returns true if a page is found
2227  *
2228  * @rs: current RAM state
2229  * @pss: data about the state of the current dirty page scan
2230  * @again: set to false if the search has scanned the whole of RAM
2231  */
2232 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2233 {
2234     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2235     if (pss->complete_round && pss->block == rs->last_seen_block &&
2236         pss->page >= rs->last_page) {
2237         /*
2238          * We've been once around the RAM and haven't found anything.
2239          * Give up.
2240          */
2241         *again = false;
2242         return false;
2243     }
2244     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2245         /* Didn't find anything in this RAM Block */
2246         pss->page = 0;
2247         pss->block = QLIST_NEXT_RCU(pss->block, next);
2248         if (!pss->block) {
2249             /*
2250              * If memory migration starts over, we will meet a dirtied page
2251              * which may still exists in compression threads's ring, so we
2252              * should flush the compressed data to make sure the new page
2253              * is not overwritten by the old one in the destination.
2254              *
2255              * Also If xbzrle is on, stop using the data compression at this
2256              * point. In theory, xbzrle can do better than compression.
2257              */
2258             flush_compressed_data(rs);
2259
2260             /* Hit the end of the list */
2261             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2262             /* Flag that we've looped */
2263             pss->complete_round = true;
2264             rs->ram_bulk_stage = false;
2265         }
2266         /* Didn't find anything this time, but try again on the new block */
2267         *again = true;
2268         return false;
2269     } else {
2270         /* Can go around again, but... */
2271         *again = true;
2272         /* We've found something so probably don't need to */
2273         return true;
2274     }
2275 }
2276
2277 /**
2278  * unqueue_page: gets a page of the queue
2279  *
2280  * Helper for 'get_queued_page' - gets a page off the queue
2281  *
2282  * Returns the block of the page (or NULL if none available)
2283  *
2284  * @rs: current RAM state
2285  * @offset: used to return the offset within the RAMBlock
2286  */
2287 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2288 {
2289     RAMBlock *block = NULL;
2290
2291     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2292         return NULL;
2293     }
2294
2295     qemu_mutex_lock(&rs->src_page_req_mutex);
2296     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2297         struct RAMSrcPageRequest *entry =
2298                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2299         block = entry->rb;
2300         *offset = entry->offset;
2301
2302         if (entry->len > TARGET_PAGE_SIZE) {
2303             entry->len -= TARGET_PAGE_SIZE;
2304             entry->offset += TARGET_PAGE_SIZE;
2305         } else {
2306             memory_region_unref(block->mr);
2307             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2308             g_free(entry);
2309             migration_consume_urgent_request();
2310         }
2311     }
2312     qemu_mutex_unlock(&rs->src_page_req_mutex);
2313
2314     return block;
2315 }
2316
2317 /**
2318  * get_queued_page: unqueue a page from the postcopy requests
2319  *
2320  * Skips pages that are already sent (!dirty)
2321  *
2322  * Returns true if a queued page is found
2323  *
2324  * @rs: current RAM state
2325  * @pss: data about the state of the current dirty page scan
2326  */
2327 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2328 {
2329     RAMBlock  *block;
2330     ram_addr_t offset;
2331     bool dirty;
2332
2333     do {
2334         block = unqueue_page(rs, &offset);
2335         /*
2336          * We're sending this page, and since it's postcopy nothing else
2337          * will dirty it, and we must make sure it doesn't get sent again
2338          * even if this queue request was received after the background
2339          * search already sent it.
2340          */
2341         if (block) {
2342             unsigned long page;
2343
2344             page = offset >> TARGET_PAGE_BITS;
2345             dirty = test_bit(page, block->bmap);
2346             if (!dirty) {
2347                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2348                        page, test_bit(page, block->unsentmap));
2349             } else {
2350                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2351             }
2352         }
2353
2354     } while (block && !dirty);
2355
2356     if (block) {
2357         /*
2358          * As soon as we start servicing pages out of order, then we have
2359          * to kill the bulk stage, since the bulk stage assumes
2360          * in (migration_bitmap_find_and_reset_dirty) that every page is
2361          * dirty, that's no longer true.
2362          */
2363         rs->ram_bulk_stage = false;
2364
2365         /*
2366          * We want the background search to continue from the queued page
2367          * since the guest is likely to want other pages near to the page
2368          * it just requested.
2369          */
2370         pss->block = block;
2371         pss->page = offset >> TARGET_PAGE_BITS;
2372
2373         /*
2374          * This unqueued page would break the "one round" check, even is
2375          * really rare.
2376          */
2377         pss->complete_round = false;
2378     }
2379
2380     return !!block;
2381 }
2382
2383 /**
2384  * migration_page_queue_free: drop any remaining pages in the ram
2385  * request queue
2386  *
2387  * It should be empty at the end anyway, but in error cases there may
2388  * be some left.  in case that there is any page left, we drop it.
2389  *
2390  */
2391 static void migration_page_queue_free(RAMState *rs)
2392 {
2393     struct RAMSrcPageRequest *mspr, *next_mspr;
2394     /* This queue generally should be empty - but in the case of a failed
2395      * migration might have some droppings in.
2396      */
2397     rcu_read_lock();
2398     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2399         memory_region_unref(mspr->rb->mr);
2400         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2401         g_free(mspr);
2402     }
2403     rcu_read_unlock();
2404 }
2405
2406 /**
2407  * ram_save_queue_pages: queue the page for transmission
2408  *
2409  * A request from postcopy destination for example.
2410  *
2411  * Returns zero on success or negative on error
2412  *
2413  * @rbname: Name of the RAMBLock of the request. NULL means the
2414  *          same that last one.
2415  * @start: starting address from the start of the RAMBlock
2416  * @len: length (in bytes) to send
2417  */
2418 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2419 {
2420     RAMBlock *ramblock;
2421     RAMState *rs = ram_state;
2422
2423     ram_counters.postcopy_requests++;
2424     rcu_read_lock();
2425     if (!rbname) {
2426         /* Reuse last RAMBlock */
2427         ramblock = rs->last_req_rb;
2428
2429         if (!ramblock) {
2430             /*
2431              * Shouldn't happen, we can't reuse the last RAMBlock if
2432              * it's the 1st request.
2433              */
2434             error_report("ram_save_queue_pages no previous block");
2435             goto err;
2436         }
2437     } else {
2438         ramblock = qemu_ram_block_by_name(rbname);
2439
2440         if (!ramblock) {
2441             /* We shouldn't be asked for a non-existent RAMBlock */
2442             error_report("ram_save_queue_pages no block '%s'", rbname);
2443             goto err;
2444         }
2445         rs->last_req_rb = ramblock;
2446     }
2447     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2448     if (start+len > ramblock->used_length) {
2449         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2450                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2451                      __func__, start, len, ramblock->used_length);
2452         goto err;
2453     }
2454
2455     struct RAMSrcPageRequest *new_entry =
2456         g_malloc0(sizeof(struct RAMSrcPageRequest));
2457     new_entry->rb = ramblock;
2458     new_entry->offset = start;
2459     new_entry->len = len;
2460
2461     memory_region_ref(ramblock->mr);
2462     qemu_mutex_lock(&rs->src_page_req_mutex);
2463     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2464     migration_make_urgent_request();
2465     qemu_mutex_unlock(&rs->src_page_req_mutex);
2466     rcu_read_unlock();
2467
2468     return 0;
2469
2470 err:
2471     rcu_read_unlock();
2472     return -1;
2473 }
2474
2475 static bool save_page_use_compression(RAMState *rs)
2476 {
2477     if (!migrate_use_compression()) {
2478         return false;
2479     }
2480
2481     /*
2482      * If xbzrle is on, stop using the data compression after first
2483      * round of migration even if compression is enabled. In theory,
2484      * xbzrle can do better than compression.
2485      */
2486     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2487         return true;
2488     }
2489
2490     return false;
2491 }
2492
2493 /*
2494  * try to compress the page before posting it out, return true if the page
2495  * has been properly handled by compression, otherwise needs other
2496  * paths to handle it
2497  */
2498 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2499 {
2500     if (!save_page_use_compression(rs)) {
2501         return false;
2502     }
2503
2504     /*
2505      * When starting the process of a new block, the first page of
2506      * the block should be sent out before other pages in the same
2507      * block, and all the pages in last block should have been sent
2508      * out, keeping this order is important, because the 'cont' flag
2509      * is used to avoid resending the block name.
2510      *
2511      * We post the fist page as normal page as compression will take
2512      * much CPU resource.
2513      */
2514     if (block != rs->last_sent_block) {
2515         flush_compressed_data(rs);
2516         return false;
2517     }
2518
2519     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2520         return true;
2521     }
2522
2523     compression_counters.busy++;
2524     return false;
2525 }
2526
2527 /**
2528  * ram_save_target_page: save one target page
2529  *
2530  * Returns the number of pages written
2531  *
2532  * @rs: current RAM state
2533  * @pss: data about the page we want to send
2534  * @last_stage: if we are at the completion stage
2535  */
2536 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2537                                 bool last_stage)
2538 {
2539     RAMBlock *block = pss->block;
2540     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2541     int res;
2542
2543     if (control_save_page(rs, block, offset, &res)) {
2544         return res;
2545     }
2546
2547     if (save_compress_page(rs, block, offset)) {
2548         return 1;
2549     }
2550
2551     res = save_zero_page(rs, block, offset);
2552     if (res > 0) {
2553         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2554          * page would be stale
2555          */
2556         if (!save_page_use_compression(rs)) {
2557             XBZRLE_cache_lock();
2558             xbzrle_cache_zero_page(rs, block->offset + offset);
2559             XBZRLE_cache_unlock();
2560         }
2561         ram_release_pages(block->idstr, offset, res);
2562         return res;
2563     }
2564
2565     /*
2566      * do not use multifd for compression as the first page in the new
2567      * block should be posted out before sending the compressed page
2568      */
2569     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2570         return ram_save_multifd_page(rs, block, offset);
2571     }
2572
2573     return ram_save_page(rs, pss, last_stage);
2574 }
2575
2576 /**
2577  * ram_save_host_page: save a whole host page
2578  *
2579  * Starting at *offset send pages up to the end of the current host
2580  * page. It's valid for the initial offset to point into the middle of
2581  * a host page in which case the remainder of the hostpage is sent.
2582  * Only dirty target pages are sent. Note that the host page size may
2583  * be a huge page for this block.
2584  * The saving stops at the boundary of the used_length of the block
2585  * if the RAMBlock isn't a multiple of the host page size.
2586  *
2587  * Returns the number of pages written or negative on error
2588  *
2589  * @rs: current RAM state
2590  * @ms: current migration state
2591  * @pss: data about the page we want to send
2592  * @last_stage: if we are at the completion stage
2593  */
2594 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2595                               bool last_stage)
2596 {
2597     int tmppages, pages = 0;
2598     size_t pagesize_bits =
2599         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2600
2601     if (ramblock_is_ignored(pss->block)) {
2602         error_report("block %s should not be migrated !", pss->block->idstr);
2603         return 0;
2604     }
2605
2606     do {
2607         /* Check the pages is dirty and if it is send it */
2608         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2609             pss->page++;
2610             continue;
2611         }
2612
2613         tmppages = ram_save_target_page(rs, pss, last_stage);
2614         if (tmppages < 0) {
2615             return tmppages;
2616         }
2617
2618         pages += tmppages;
2619         if (pss->block->unsentmap) {
2620             clear_bit(pss->page, pss->block->unsentmap);
2621         }
2622
2623         pss->page++;
2624     } while ((pss->page & (pagesize_bits - 1)) &&
2625              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2626
2627     /* The offset we leave with is the last one we looked at */
2628     pss->page--;
2629     return pages;
2630 }
2631
2632 /**
2633  * ram_find_and_save_block: finds a dirty page and sends it to f
2634  *
2635  * Called within an RCU critical section.
2636  *
2637  * Returns the number of pages written where zero means no dirty pages,
2638  * or negative on error
2639  *
2640  * @rs: current RAM state
2641  * @last_stage: if we are at the completion stage
2642  *
2643  * On systems where host-page-size > target-page-size it will send all the
2644  * pages in a host page that are dirty.
2645  */
2646
2647 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2648 {
2649     PageSearchStatus pss;
2650     int pages = 0;
2651     bool again, found;
2652
2653     /* No dirty page as there is zero RAM */
2654     if (!ram_bytes_total()) {
2655         return pages;
2656     }
2657
2658     pss.block = rs->last_seen_block;
2659     pss.page = rs->last_page;
2660     pss.complete_round = false;
2661
2662     if (!pss.block) {
2663         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2664     }
2665
2666     do {
2667         again = true;
2668         found = get_queued_page(rs, &pss);
2669
2670         if (!found) {
2671             /* priority queue empty, so just search for something dirty */
2672             found = find_dirty_block(rs, &pss, &again);
2673         }
2674
2675         if (found) {
2676             pages = ram_save_host_page(rs, &pss, last_stage);
2677         }
2678     } while (!pages && again);
2679
2680     rs->last_seen_block = pss.block;
2681     rs->last_page = pss.page;
2682
2683     return pages;
2684 }
2685
2686 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2687 {
2688     uint64_t pages = size / TARGET_PAGE_SIZE;
2689
2690     if (zero) {
2691         ram_counters.duplicate += pages;
2692     } else {
2693         ram_counters.normal += pages;
2694         ram_counters.transferred += size;
2695         qemu_update_position(f, size);
2696     }
2697 }
2698
2699 static uint64_t ram_bytes_total_common(bool count_ignored)
2700 {
2701     RAMBlock *block;
2702     uint64_t total = 0;
2703
2704     rcu_read_lock();
2705     if (count_ignored) {
2706         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2707             total += block->used_length;
2708         }
2709     } else {
2710         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2711             total += block->used_length;
2712         }
2713     }
2714     rcu_read_unlock();
2715     return total;
2716 }
2717
/*
 * ram_bytes_total: total used length, in bytes, of the RAM blocks that
 * are not ignored
 */
uint64_t ram_bytes_total(void)
{
    uint64_t total = ram_bytes_total_common(false);

    return total;
}
2722
2723 static void xbzrle_load_setup(void)
2724 {
2725     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2726 }
2727
2728 static void xbzrle_load_cleanup(void)
2729 {
2730     g_free(XBZRLE.decoded_buf);
2731     XBZRLE.decoded_buf = NULL;
2732 }
2733
2734 static void ram_state_cleanup(RAMState **rsp)
2735 {
2736     if (*rsp) {
2737         migration_page_queue_free(*rsp);
2738         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2739         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2740         g_free(*rsp);
2741         *rsp = NULL;
2742     }
2743 }
2744
2745 static void xbzrle_cleanup(void)
2746 {
2747     XBZRLE_cache_lock();
2748     if (XBZRLE.cache) {
2749         cache_fini(XBZRLE.cache);
2750         g_free(XBZRLE.encoded_buf);
2751         g_free(XBZRLE.current_buf);
2752         g_free(XBZRLE.zero_target_page);
2753         XBZRLE.cache = NULL;
2754         XBZRLE.encoded_buf = NULL;
2755         XBZRLE.current_buf = NULL;
2756         XBZRLE.zero_target_page = NULL;
2757     }
2758     XBZRLE_cache_unlock();
2759 }
2760
2761 static void ram_save_cleanup(void *opaque)
2762 {
2763     RAMState **rsp = opaque;
2764     RAMBlock *block;
2765
2766     /* caller have hold iothread lock or is in a bh, so there is
2767      * no writing race against the migration bitmap
2768      */
2769     memory_global_dirty_log_stop();
2770
2771     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2772         g_free(block->clear_bmap);
2773         block->clear_bmap = NULL;
2774         g_free(block->bmap);
2775         block->bmap = NULL;
2776         g_free(block->unsentmap);
2777         block->unsentmap = NULL;
2778     }
2779
2780     xbzrle_cleanup();
2781     compress_threads_save_cleanup();
2782     ram_state_cleanup(rsp);
2783 }
2784
2785 static void ram_state_reset(RAMState *rs)
2786 {
2787     rs->last_seen_block = NULL;
2788     rs->last_sent_block = NULL;
2789     rs->last_page = 0;
2790     rs->last_version = ram_list.version;
2791     rs->ram_bulk_stage = true;
2792     rs->fpo_enabled = false;
2793 }
2794
/*
 * Time budget, in milliseconds, for the send loop of ram_save_iterate():
 * the loop gives up once its elapsed time exceeds this value.
 */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
2796
/*
 * Debug helper: print a bitmap to stderr as lines of up to 128 characters,
 * '1' for a set bit and '.' for a clear one, each line prefixed with the
 * index of its first bit.
 *
 * @todump:   bitmap to print.  A NULL pointer is ignored: this function
 *            has no other bitmap it could fall back to.
 * @expected: the value the bitmap is expected to be mostly full of; lines
 *            that consist only of this value are not printed.
 * @pages:    number of bits of @todump to examine
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    /* Bail out instead of handing a NULL bitmap to test_bit() */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen here, so this terminates the line just built */
            linebuf[curb] = '\0';
            /* PRIx64 expects an unsigned 64-bit argument */
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
2830
2831 /* **** functions for postcopy ***** */
2832
2833 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2834 {
2835     struct RAMBlock *block;
2836
2837     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2838         unsigned long *bitmap = block->bmap;
2839         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2840         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2841
2842         while (run_start < range) {
2843             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2844             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2845                               (run_end - run_start) << TARGET_PAGE_BITS);
2846             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2847         }
2848     }
2849 }
2850
2851 /**
2852  * postcopy_send_discard_bm_ram: discard a RAMBlock
2853  *
2854  * Returns zero on success
2855  *
2856  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2857  * Note: At this point the 'unsentmap' is the processed bitmap combined
2858  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2859  *
2860  * @ms: current migration state
2861  * @block: RAMBlock to discard
2862  */
2863 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2864 {
2865     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2866     unsigned long current;
2867     unsigned long *unsentmap = block->unsentmap;
2868
2869     for (current = 0; current < end; ) {
2870         unsigned long one = find_next_bit(unsentmap, end, current);
2871         unsigned long zero, discard_length;
2872
2873         if (one >= end) {
2874             break;
2875         }
2876
2877         zero = find_next_zero_bit(unsentmap, end, one + 1);
2878
2879         if (zero >= end) {
2880             discard_length = end - one;
2881         } else {
2882             discard_length = zero - one;
2883         }
2884         postcopy_discard_send_range(ms, one, discard_length);
2885         current = one + discard_length;
2886     }
2887
2888     return 0;
2889 }
2890
2891 /**
2892  * postcopy_each_ram_send_discard: discard all RAMBlocks
2893  *
2894  * Returns 0 for success or negative for error
2895  *
2896  * Utility for the outgoing postcopy code.
2897  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2898  *   passing it bitmap indexes and name.
2899  * (qemu_ram_foreach_block ends up passing unscaled lengths
2900  *  which would mean postcopy code would have to deal with target page)
2901  *
2902  * @ms: current migration state
2903  */
2904 static int postcopy_each_ram_send_discard(MigrationState *ms)
2905 {
2906     struct RAMBlock *block;
2907     int ret;
2908
2909     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2910         postcopy_discard_send_init(ms, block->idstr);
2911
2912         /*
2913          * Postcopy sends chunks of bitmap over the wire, but it
2914          * just needs indexes at this point, avoids it having
2915          * target page specific code.
2916          */
2917         ret = postcopy_send_discard_bm_ram(ms, block);
2918         postcopy_discard_send_finish(ms);
2919         if (ret) {
2920             return ret;
2921         }
2922     }
2923
2924     return 0;
2925 }
2926
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * Every host page found to be only partially covered by a run is marked
 * entirely unsent and entirely dirty, and rs->migration_dirty_pages is
 * bumped for each target page that was not dirty before.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {

        /*
         * If the run starts on a host page boundary, its start needs no
         * fixup; move run_start on to the end of the run instead.  If it
         * starts in the middle of a host page, leave run_start alone so
         * that the check below fixes up that host page.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            if (unsent_pass) {
                run_start = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        /*
         * run_start is now either an unaligned run start or an unaligned
         * run end; in both cases the host page containing it is mixed.
         */
        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            /* Continue the search from the next host page boundary */
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, fixup_start_addr, host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
3026
/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
 *
 * Both passes run between one postcopy_discard_send_init() /
 * postcopy_discard_send_finish() pair for the block.
 *
 * Returns zero on success (this implementation always returns zero)
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    postcopy_discard_send_init(ms, block->idstr);

    /* First pass: Discard all partially sent host pages */
    postcopy_chunk_hostpages_pass(ms, true, block);
    /*
     * Second pass: Ensure that all partially dirty host pages are made
     * fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, false, block);

    postcopy_discard_send_finish(ms);
    return 0;
}
3056
3057 /**
3058  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3059  *
3060  * Returns zero on success
3061  *
3062  * Transmit the set of pages to be discarded after precopy to the target
3063  * these are pages that:
3064  *     a) Have been previously transmitted but are now dirty again
3065  *     b) Pages that have never been transmitted, this ensures that
3066  *        any pages on the destination that have been mapped by background
3067  *        tasks get discarded (transparent huge pages is the specific concern)
3068  * Hopefully this is pretty sparse
3069  *
3070  * @ms: current migration state
3071  */
3072 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3073 {
3074     RAMState *rs = ram_state;
3075     RAMBlock *block;
3076     int ret;
3077
3078     rcu_read_lock();
3079
3080     /* This should be our last sync, the src is now paused */
3081     migration_bitmap_sync(rs);
3082
3083     /* Easiest way to make sure we don't resume in the middle of a host-page */
3084     rs->last_seen_block = NULL;
3085     rs->last_sent_block = NULL;
3086     rs->last_page = 0;
3087
3088     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3089         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3090         unsigned long *bitmap = block->bmap;
3091         unsigned long *unsentmap = block->unsentmap;
3092
3093         if (!unsentmap) {
3094             /* We don't have a safe way to resize the sentmap, so
3095              * if the bitmap was resized it will be NULL at this
3096              * point.
3097              */
3098             error_report("migration ram resized during precopy phase");
3099             rcu_read_unlock();
3100             return -EINVAL;
3101         }
3102         /* Deal with TPS != HPS and huge pages */
3103         ret = postcopy_chunk_hostpages(ms, block);
3104         if (ret) {
3105             rcu_read_unlock();
3106             return ret;
3107         }
3108
3109         /*
3110          * Update the unsentmap to be unsentmap = unsentmap | dirty
3111          */
3112         bitmap_or(unsentmap, unsentmap, bitmap, pages);
3113 #ifdef DEBUG_POSTCOPY
3114         ram_debug_dump_bitmap(unsentmap, true, pages);
3115 #endif
3116     }
3117     trace_ram_postcopy_send_discard_bitmap();
3118
3119     ret = postcopy_each_ram_send_discard(ms);
3120     rcu_read_unlock();
3121
3122     return ret;
3123 }
3124
3125 /**
3126  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3127  *
3128  * Returns zero on success
3129  *
3130  * @rbname: name of the RAMBlock of the request. NULL means the
3131  *          same that last one.
3132  * @start: RAMBlock starting page
3133  * @length: RAMBlock size
3134  */
3135 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3136 {
3137     int ret = -1;
3138
3139     trace_ram_discard_range(rbname, start, length);
3140
3141     rcu_read_lock();
3142     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3143
3144     if (!rb) {
3145         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3146         goto err;
3147     }
3148
3149     /*
3150      * On source VM, we don't need to update the received bitmap since
3151      * we don't even have one.
3152      */
3153     if (rb->receivedmap) {
3154         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3155                      length >> qemu_target_page_bits());
3156     }
3157
3158     ret = ram_block_discard_range(rb, start, length);
3159
3160 err:
3161     rcu_read_unlock();
3162
3163     return ret;
3164 }
3165
3166 /*
3167  * For every allocation, we will try not to crash the VM if the
3168  * allocation failed.
3169  */
3170 static int xbzrle_init(void)
3171 {
3172     Error *local_err = NULL;
3173
3174     if (!migrate_use_xbzrle()) {
3175         return 0;
3176     }
3177
3178     XBZRLE_cache_lock();
3179
3180     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3181     if (!XBZRLE.zero_target_page) {
3182         error_report("%s: Error allocating zero page", __func__);
3183         goto err_out;
3184     }
3185
3186     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3187                               TARGET_PAGE_SIZE, &local_err);
3188     if (!XBZRLE.cache) {
3189         error_report_err(local_err);
3190         goto free_zero_page;
3191     }
3192
3193     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3194     if (!XBZRLE.encoded_buf) {
3195         error_report("%s: Error allocating encoded_buf", __func__);
3196         goto free_cache;
3197     }
3198
3199     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3200     if (!XBZRLE.current_buf) {
3201         error_report("%s: Error allocating current_buf", __func__);
3202         goto free_encoded_buf;
3203     }
3204
3205     /* We are all good */
3206     XBZRLE_cache_unlock();
3207     return 0;
3208
3209 free_encoded_buf:
3210     g_free(XBZRLE.encoded_buf);
3211     XBZRLE.encoded_buf = NULL;
3212 free_cache:
3213     cache_fini(XBZRLE.cache);
3214     XBZRLE.cache = NULL;
3215 free_zero_page:
3216     g_free(XBZRLE.zero_target_page);
3217     XBZRLE.zero_target_page = NULL;
3218 err_out:
3219     XBZRLE_cache_unlock();
3220     return -ENOMEM;
3221 }
3222
3223 static int ram_state_init(RAMState **rsp)
3224 {
3225     *rsp = g_try_new0(RAMState, 1);
3226
3227     if (!*rsp) {
3228         error_report("%s: Init ramstate fail", __func__);
3229         return -1;
3230     }
3231
3232     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3233     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3234     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3235
3236     /*
3237      * Count the total number of pages used by ram blocks not including any
3238      * gaps due to alignment or unplugs.
3239      * This must match with the initial values of dirty bitmap.
3240      */
3241     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3242     ram_state_reset(*rsp);
3243
3244     return 0;
3245 }
3246
3247 static void ram_list_init_bitmaps(void)
3248 {
3249     MigrationState *ms = migrate_get_current();
3250     RAMBlock *block;
3251     unsigned long pages;
3252     uint8_t shift;
3253
3254     /* Skip setting bitmap if there is no RAM */
3255     if (ram_bytes_total()) {
3256         shift = ms->clear_bitmap_shift;
3257         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3258             error_report("clear_bitmap_shift (%u) too big, using "
3259                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3260             shift = CLEAR_BITMAP_SHIFT_MAX;
3261         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3262             error_report("clear_bitmap_shift (%u) too small, using "
3263                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3264             shift = CLEAR_BITMAP_SHIFT_MIN;
3265         }
3266
3267         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3268             pages = block->max_length >> TARGET_PAGE_BITS;
3269             /*
3270              * The initial dirty bitmap for migration must be set with all
3271              * ones to make sure we'll migrate every guest RAM page to
3272              * destination.
3273              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3274              * new migration after a failed migration, ram_list.
3275              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3276              * guest memory.
3277              */
3278             block->bmap = bitmap_new(pages);
3279             bitmap_set(block->bmap, 0, pages);
3280             block->clear_bmap_shift = shift;
3281             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3282             if (migrate_postcopy_ram()) {
3283                 block->unsentmap = bitmap_new(pages);
3284                 bitmap_set(block->unsentmap, 0, pages);
3285             }
3286         }
3287     }
3288 }
3289
/*
 * Create the migration bitmaps, start global dirty logging and perform the
 * first precopy bitmap sync for @rs.
 *
 * All three steps run with the iothread lock, the ramlist lock and the RCU
 * read lock held; the locks are dropped in the reverse order of
 * acquisition.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync_precopy(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3305
3306 static int ram_init_all(RAMState **rsp)
3307 {
3308     if (ram_state_init(rsp)) {
3309         return -1;
3310     }
3311
3312     if (xbzrle_init()) {
3313         ram_state_cleanup(rsp);
3314         return -1;
3315     }
3316
3317     ram_init_bitmaps(*rsp);
3318
3319     return 0;
3320 }
3321
3322 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3323 {
3324     RAMBlock *block;
3325     uint64_t pages = 0;
3326
3327     /*
3328      * Postcopy is not using xbzrle/compression, so no need for that.
3329      * Also, since source are already halted, we don't need to care
3330      * about dirty page logging as well.
3331      */
3332
3333     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3334         pages += bitmap_count_one(block->bmap,
3335                                   block->used_length >> TARGET_PAGE_BITS);
3336     }
3337
3338     /* This may not be aligned with current bitmaps. Recalculate. */
3339     rs->migration_dirty_pages = pages;
3340
3341     rs->last_seen_block = NULL;
3342     rs->last_sent_block = NULL;
3343     rs->last_page = 0;
3344     rs->last_version = ram_list.version;
3345     /*
3346      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3347      * matter what we have sent.
3348      */
3349     rs->ram_bulk_stage = false;
3350
3351     /* Update RAMState cache of output QEMUFile */
3352     rs->f = out;
3353
3354     trace_ram_state_resume_prepare(pages);
3355 }
3356
3357 /*
3358  * This function clears bits of the free pages reported by the caller from the
3359  * migration dirty bitmap. @addr is the host address corresponding to the
3360  * start of the continuous guest free pages, and @len is the total bytes of
3361  * those pages.
3362  */
3363 void qemu_guest_free_page_hint(void *addr, size_t len)
3364 {
3365     RAMBlock *block;
3366     ram_addr_t offset;
3367     size_t used_len, start, npages;
3368     MigrationState *s = migrate_get_current();
3369
3370     /* This function is currently expected to be used during live migration */
3371     if (!migration_is_setup_or_active(s->state)) {
3372         return;
3373     }
3374
3375     for (; len > 0; len -= used_len, addr += used_len) {
3376         block = qemu_ram_block_from_host(addr, false, &offset);
3377         if (unlikely(!block || offset >= block->used_length)) {
3378             /*
3379              * The implementation might not support RAMBlock resize during
3380              * live migration, but it could happen in theory with future
3381              * updates. So we add a check here to capture that case.
3382              */
3383             error_report_once("%s unexpected error", __func__);
3384             return;
3385         }
3386
3387         if (len <= block->used_length - offset) {
3388             used_len = len;
3389         } else {
3390             used_len = block->used_length - offset;
3391         }
3392
3393         start = offset >> TARGET_PAGE_BITS;
3394         npages = used_len >> TARGET_PAGE_BITS;
3395
3396         qemu_mutex_lock(&ram_state->bitmap_mutex);
3397         ram_state->migration_dirty_pages -=
3398                       bitmap_count_one_with_offset(block->bmap, start, npages);
3399         bitmap_clear(block->bmap, start, npages);
3400         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3401     }
3402 }
3403
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * become numerous it will be necessary to reduce the granularity of
 * these critical sections.
 */
3410
3411 /**
3412  * ram_save_setup: Setup RAM for migration
3413  *
3414  * Returns zero to indicate success and negative for error
3415  *
3416  * @f: QEMUFile where to send the data
3417  * @opaque: RAMState pointer
3418  */
3419 static int ram_save_setup(QEMUFile *f, void *opaque)
3420 {
3421     RAMState **rsp = opaque;
3422     RAMBlock *block;
3423
3424     if (compress_threads_save_setup()) {
3425         return -1;
3426     }
3427
3428     /* migration has already setup the bitmap, reuse it. */
3429     if (!migration_in_colo_state()) {
3430         if (ram_init_all(rsp) != 0) {
3431             compress_threads_save_cleanup();
3432             return -1;
3433         }
3434     }
3435     (*rsp)->f = f;
3436
3437     rcu_read_lock();
3438
3439     qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3440
3441     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3442         qemu_put_byte(f, strlen(block->idstr));
3443         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3444         qemu_put_be64(f, block->used_length);
3445         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3446             qemu_put_be64(f, block->page_size);
3447         }
3448         if (migrate_ignore_shared()) {
3449             qemu_put_be64(f, block->mr->addr);
3450         }
3451     }
3452
3453     rcu_read_unlock();
3454
3455     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3456     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3457
3458     multifd_send_sync_main(*rsp);
3459     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3460     qemu_fflush(f);
3461
3462     return 0;
3463 }
3464
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends pages until the rate limit is hit (and no page requests are
 * queued), the file reports an error, there is nothing left to send, or
 * the loop has been running for more than MAX_WAIT milliseconds.  The
 * round is always closed with a RAM_SAVE_FLAG_EOS marker.
 *
 * Returns a negative value when the QEMUFile is in error; otherwise 1 if
 * this round ran out of pages to send and 0 if it did not
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState ** of the migration state
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    rcu_read_lock();
    /* The RAM list changed since the last round: restart the scan */
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    /* Queued page requests keep the loop going even when rate limited */
    while ((ret = qemu_file_rate_limit(f)) == 0 ||
            !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        int pages;

        if (qemu_file_get_error(f)) {
            break;
        }

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }

        /* A negative count is an error code; latch it on the file */
        if (pages < 0) {
            qemu_file_set_error(f, pages);
            break;
        }

        rs->target_page_count += pages;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check once
           every 64 iterations
        */
        if ((i & 63) == 0) {
            /* Elapsed time since the loop started, in milliseconds */
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    multifd_send_sync_main(rs);
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);
    /* Account for the 8 byte EOS marker written above */
    ram_counters.transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
3558
3559 /**
3560  * ram_save_complete: function called to send the remaining amount of ram
3561  *
3562  * Returns zero to indicate success or negative on error
3563  *
3564  * Called with iothread lock
3565  *
3566  * @f: QEMUFile where to send the data
3567  * @opaque: RAMState pointer
3568  */
3569 static int ram_save_complete(QEMUFile *f, void *opaque)
3570 {
3571     RAMState **temp = opaque;
3572     RAMState *rs = *temp;
3573     int ret = 0;
3574
3575     rcu_read_lock();
3576
3577     if (!migration_in_postcopy()) {
3578         migration_bitmap_sync_precopy(rs);
3579     }
3580
3581     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3582
3583     /* try transferring iterative blocks of memory */
3584
3585     /* flush all remaining blocks regardless of rate limiting */
3586     while (true) {
3587         int pages;
3588
3589         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3590         /* no more blocks to sent */
3591         if (pages == 0) {
3592             break;
3593         }
3594         if (pages < 0) {
3595             ret = pages;
3596             break;
3597         }
3598     }
3599
3600     flush_compressed_data(rs);
3601     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3602
3603     rcu_read_unlock();
3604
3605     multifd_send_sync_main(rs);
3606     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3607     qemu_fflush(f);
3608
3609     return ret;
3610 }
3611
3612 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3613                              uint64_t *res_precopy_only,
3614                              uint64_t *res_compatible,
3615                              uint64_t *res_postcopy_only)
3616 {
3617     RAMState **temp = opaque;
3618     RAMState *rs = *temp;
3619     uint64_t remaining_size;
3620
3621     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3622
3623     if (!migration_in_postcopy() &&
3624         remaining_size < max_size) {
3625         qemu_mutex_lock_iothread();
3626         rcu_read_lock();
3627         migration_bitmap_sync_precopy(rs);
3628         rcu_read_unlock();
3629         qemu_mutex_unlock_iothread();
3630         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3631     }
3632
3633     if (migrate_postcopy_ram()) {
3634         /* We can do postcopy, and all the data is postcopiable */
3635         *res_compatible += remaining_size;
3636     } else {
3637         *res_precopy_only += remaining_size;
3638     }
3639 }
3640
3641 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3642 {
3643     unsigned int xh_len;
3644     int xh_flags;
3645     uint8_t *loaded_data;
3646
3647     /* extract RLE header */
3648     xh_flags = qemu_get_byte(f);
3649     xh_len = qemu_get_be16(f);
3650
3651     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3652         error_report("Failed to load XBZRLE page - wrong compression!");
3653         return -1;
3654     }
3655
3656     if (xh_len > TARGET_PAGE_SIZE) {
3657         error_report("Failed to load XBZRLE page - len overflow!");
3658         return -1;
3659     }
3660     loaded_data = XBZRLE.decoded_buf;
3661     /* load data and decode */
3662     /* it can change loaded_data to point to an internal buffer */
3663     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3664
3665     /* decode RLE */
3666     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3667                              TARGET_PAGE_SIZE) == -1) {
3668         error_report("Failed to load XBZRLE page - decode error!");
3669         return -1;
3670     }
3671
3672     return 0;
3673 }
3674
3675 /**
3676  * ram_block_from_stream: read a RAMBlock id from the migration stream
3677  *
3678  * Must be called from within a rcu critical section.
3679  *
3680  * Returns a pointer from within the RCU-protected ram_list.
3681  *
3682  * @f: QEMUFile where to read the data from
3683  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3684  */
3685 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3686 {
3687     static RAMBlock *block = NULL;
3688     char id[256];
3689     uint8_t len;
3690
3691     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3692         if (!block) {
3693             error_report("Ack, bad migration stream!");
3694             return NULL;
3695         }
3696         return block;
3697     }
3698
3699     len = qemu_get_byte(f);
3700     qemu_get_buffer(f, (uint8_t *)id, len);
3701     id[len] = 0;
3702
3703     block = qemu_ram_block_by_name(id);
3704     if (!block) {
3705         error_report("Can't find block %s", id);
3706         return NULL;
3707     }
3708
3709     if (ramblock_is_ignored(block)) {
3710         error_report("block %s should not be migrated !", id);
3711         return NULL;
3712     }
3713
3714     return block;
3715 }
3716
3717 static inline void *host_from_ram_block_offset(RAMBlock *block,
3718                                                ram_addr_t offset)
3719 {
3720     if (!offset_in_ramblock(block, offset)) {
3721         return NULL;
3722     }
3723
3724     return block->host + offset;
3725 }
3726
3727 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3728                                                  ram_addr_t offset)
3729 {
3730     if (!offset_in_ramblock(block, offset)) {
3731         return NULL;
3732     }
3733     if (!block->colo_cache) {
3734         error_report("%s: colo_cache is NULL in block :%s",
3735                      __func__, block->idstr);
3736         return NULL;
3737     }
3738
3739     /*
3740     * During colo checkpoint, we need bitmap of these migrated pages.
3741     * It help us to decide which pages in ram cache should be flushed
3742     * into VM's RAM later.
3743     */
3744     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3745         ram_state->migration_dirty_pages++;
3746     }
3747     return block->colo_cache + offset;
3748 }
3749
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with @ch.  When @ch is zero and the range is
 * already all zeroes, the memory is left untouched instead of being
 * rewritten.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to do: the range already holds the requested zero fill */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3766
3767 /* return the size after decompression, or negative value on error */
3768 static int
3769 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3770                      const uint8_t *source, size_t source_len)
3771 {
3772     int err;
3773
3774     err = inflateReset(stream);
3775     if (err != Z_OK) {
3776         return -1;
3777     }
3778
3779     stream->avail_in = source_len;
3780     stream->next_in = (uint8_t *)source;
3781     stream->avail_out = dest_len;
3782     stream->next_out = dest;
3783
3784     err = inflate(stream, Z_NO_FLUSH);
3785     if (err != Z_STREAM_END) {
3786         return -1;
3787     }
3788
3789     return stream->total_out;
3790 }
3791
3792 static void *do_data_decompress(void *opaque)
3793 {
3794     DecompressParam *param = opaque;
3795     unsigned long pagesize;
3796     uint8_t *des;
3797     int len, ret;
3798
3799     qemu_mutex_lock(&param->mutex);
3800     while (!param->quit) {
3801         if (param->des) {
3802             des = param->des;
3803             len = param->len;
3804             param->des = 0;
3805             qemu_mutex_unlock(&param->mutex);
3806
3807             pagesize = TARGET_PAGE_SIZE;
3808
3809             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3810                                        param->compbuf, len);
3811             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3812                 error_report("decompress data failed");
3813                 qemu_file_set_error(decomp_file, ret);
3814             }
3815
3816             qemu_mutex_lock(&decomp_done_lock);
3817             param->done = true;
3818             qemu_cond_signal(&decomp_done_cond);
3819             qemu_mutex_unlock(&decomp_done_lock);
3820
3821             qemu_mutex_lock(&param->mutex);
3822         } else {
3823             qemu_cond_wait(&param->cond, &param->mutex);
3824         }
3825     }
3826     qemu_mutex_unlock(&param->mutex);
3827
3828     return NULL;
3829 }
3830
3831 static int wait_for_decompress_done(void)
3832 {
3833     int idx, thread_count;
3834
3835     if (!migrate_use_compression()) {
3836         return 0;
3837     }
3838
3839     thread_count = migrate_decompress_threads();
3840     qemu_mutex_lock(&decomp_done_lock);
3841     for (idx = 0; idx < thread_count; idx++) {
3842         while (!decomp_param[idx].done) {
3843             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3844         }
3845     }
3846     qemu_mutex_unlock(&decomp_done_lock);
3847     return qemu_file_get_error(decomp_file);
3848 }
3849
3850 static void compress_threads_load_cleanup(void)
3851 {
3852     int i, thread_count;
3853
3854     if (!migrate_use_compression()) {
3855         return;
3856     }
3857     thread_count = migrate_decompress_threads();
3858     for (i = 0; i < thread_count; i++) {
3859         /*
3860          * we use it as a indicator which shows if the thread is
3861          * properly init'd or not
3862          */
3863         if (!decomp_param[i].compbuf) {
3864             break;
3865         }
3866
3867         qemu_mutex_lock(&decomp_param[i].mutex);
3868         decomp_param[i].quit = true;
3869         qemu_cond_signal(&decomp_param[i].cond);
3870         qemu_mutex_unlock(&decomp_param[i].mutex);
3871     }
3872     for (i = 0; i < thread_count; i++) {
3873         if (!decomp_param[i].compbuf) {
3874             break;
3875         }
3876
3877         qemu_thread_join(decompress_threads + i);
3878         qemu_mutex_destroy(&decomp_param[i].mutex);
3879         qemu_cond_destroy(&decomp_param[i].cond);
3880         inflateEnd(&decomp_param[i].stream);
3881         g_free(decomp_param[i].compbuf);
3882         decomp_param[i].compbuf = NULL;
3883     }
3884     g_free(decompress_threads);
3885     g_free(decomp_param);
3886     decompress_threads = NULL;
3887     decomp_param = NULL;
3888     decomp_file = NULL;
3889 }
3890
3891 static int compress_threads_load_setup(QEMUFile *f)
3892 {
3893     int i, thread_count;
3894
3895     if (!migrate_use_compression()) {
3896         return 0;
3897     }
3898
3899     thread_count = migrate_decompress_threads();
3900     decompress_threads = g_new0(QemuThread, thread_count);
3901     decomp_param = g_new0(DecompressParam, thread_count);
3902     qemu_mutex_init(&decomp_done_lock);
3903     qemu_cond_init(&decomp_done_cond);
3904     decomp_file = f;
3905     for (i = 0; i < thread_count; i++) {
3906         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3907             goto exit;
3908         }
3909
3910         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3911         qemu_mutex_init(&decomp_param[i].mutex);
3912         qemu_cond_init(&decomp_param[i].cond);
3913         decomp_param[i].done = true;
3914         decomp_param[i].quit = false;
3915         qemu_thread_create(decompress_threads + i, "decompress",
3916                            do_data_decompress, decomp_param + i,
3917                            QEMU_THREAD_JOINABLE);
3918     }
3919     return 0;
3920 exit:
3921     compress_threads_load_cleanup();
3922     return -1;
3923 }
3924
3925 static void decompress_data_with_multi_threads(QEMUFile *f,
3926                                                void *host, int len)
3927 {
3928     int idx, thread_count;
3929
3930     thread_count = migrate_decompress_threads();
3931     qemu_mutex_lock(&decomp_done_lock);
3932     while (true) {
3933         for (idx = 0; idx < thread_count; idx++) {
3934             if (decomp_param[idx].done) {
3935                 decomp_param[idx].done = false;
3936                 qemu_mutex_lock(&decomp_param[idx].mutex);
3937                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3938                 decomp_param[idx].des = host;
3939                 decomp_param[idx].len = len;
3940                 qemu_cond_signal(&decomp_param[idx].cond);
3941                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3942                 break;
3943             }
3944         }
3945         if (idx < thread_count) {
3946             break;
3947         } else {
3948             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3949         }
3950     }
3951     qemu_mutex_unlock(&decomp_done_lock);
3952 }
3953
3954 /*
3955  * colo cache: this is for secondary VM, we cache the whole
3956  * memory of the secondary VM, it is need to hold the global lock
3957  * to call this helper.
3958  */
3959 int colo_init_ram_cache(void)
3960 {
3961     RAMBlock *block;
3962
3963     rcu_read_lock();
3964     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3965         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3966                                                 NULL,
3967                                                 false);
3968         if (!block->colo_cache) {
3969             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3970                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3971                          block->used_length);
3972             goto out_locked;
3973         }
3974         memcpy(block->colo_cache, block->host, block->used_length);
3975     }
3976     rcu_read_unlock();
3977     /*
3978     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3979     * with to decide which page in cache should be flushed into SVM's RAM. Here
3980     * we use the same name 'ram_bitmap' as for migration.
3981     */
3982     if (ram_bytes_total()) {
3983         RAMBlock *block;
3984
3985         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3986             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3987
3988             block->bmap = bitmap_new(pages);
3989             bitmap_set(block->bmap, 0, pages);
3990         }
3991     }
3992     ram_state = g_new0(RAMState, 1);
3993     ram_state->migration_dirty_pages = 0;
3994     qemu_mutex_init(&ram_state->bitmap_mutex);
3995     memory_global_dirty_log_start();
3996
3997     return 0;
3998
3999 out_locked:
4000
4001     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4002         if (block->colo_cache) {
4003             qemu_anon_ram_free(block->colo_cache, block->used_length);
4004             block->colo_cache = NULL;
4005         }
4006     }
4007
4008     rcu_read_unlock();
4009     return -errno;
4010 }
4011
4012 /* It is need to hold the global lock to call this helper */
4013 void colo_release_ram_cache(void)
4014 {
4015     RAMBlock *block;
4016
4017     memory_global_dirty_log_stop();
4018     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4019         g_free(block->bmap);
4020         block->bmap = NULL;
4021     }
4022
4023     rcu_read_lock();
4024
4025     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4026         if (block->colo_cache) {
4027             qemu_anon_ram_free(block->colo_cache, block->used_length);
4028             block->colo_cache = NULL;
4029         }
4030     }
4031
4032     rcu_read_unlock();
4033     qemu_mutex_destroy(&ram_state->bitmap_mutex);
4034     g_free(ram_state);
4035     ram_state = NULL;
4036 }
4037
4038 /**
4039  * ram_load_setup: Setup RAM for migration incoming side
4040  *
4041  * Returns zero to indicate success and negative for error
4042  *
4043  * @f: QEMUFile where to receive the data
4044  * @opaque: RAMState pointer
4045  */
4046 static int ram_load_setup(QEMUFile *f, void *opaque)
4047 {
4048     if (compress_threads_load_setup(f)) {
4049         return -1;
4050     }
4051
4052     xbzrle_load_setup();
4053     ramblock_recv_map_init();
4054
4055     return 0;
4056 }
4057
4058 static int ram_load_cleanup(void *opaque)
4059 {
4060     RAMBlock *rb;
4061
4062     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4063         if (ramblock_is_pmem(rb)) {
4064             pmem_persist(rb->host, rb->used_length);
4065         }
4066     }
4067
4068     xbzrle_load_cleanup();
4069     compress_threads_load_cleanup();
4070
4071     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4072         g_free(rb->receivedmap);
4073         rb->receivedmap = NULL;
4074     }
4075
4076     return 0;
4077 }
4078
4079 /**
4080  * ram_postcopy_incoming_init: allocate postcopy data structures
4081  *
4082  * Returns 0 for success and negative if there was one error
4083  *
4084  * @mis: current migration incoming state
4085  *
4086  * Allocate data structures etc needed by incoming migration with
4087  * postcopy-ram. postcopy-ram's similarly names
4088  * postcopy_ram_incoming_init does the work.
4089  */
4090 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4091 {
4092     return postcopy_ram_incoming_init(mis);
4093 }
4094
4095 /**
4096  * ram_load_postcopy: load a page in postcopy case
4097  *
4098  * Returns 0 for success or -errno in case of error
4099  *
4100  * Called in postcopy mode by ram_load().
4101  * rcu_read_lock is taken prior to this being called.
4102  *
4103  * @f: QEMUFile where to send the data
4104  */
4105 static int ram_load_postcopy(QEMUFile *f)
4106 {
4107     int flags = 0, ret = 0;
4108     bool place_needed = false;
4109     bool matches_target_page_size = false;
4110     MigrationIncomingState *mis = migration_incoming_get_current();
4111     /* Temporary page that is later 'placed' */
4112     void *postcopy_host_page = postcopy_get_tmp_page(mis);
4113     void *last_host = NULL;
4114     bool all_zero = false;
4115
4116     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4117         ram_addr_t addr;
4118         void *host = NULL;
4119         void *page_buffer = NULL;
4120         void *place_source = NULL;
4121         RAMBlock *block = NULL;
4122         uint8_t ch;
4123
4124         addr = qemu_get_be64(f);
4125
4126         /*
4127          * If qemu file error, we should stop here, and then "addr"
4128          * may be invalid
4129          */
4130         ret = qemu_file_get_error(f);
4131         if (ret) {
4132             break;
4133         }
4134
4135         flags = addr & ~TARGET_PAGE_MASK;
4136         addr &= TARGET_PAGE_MASK;
4137
4138         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4139         place_needed = false;
4140         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4141             block = ram_block_from_stream(f, flags);
4142
4143             host = host_from_ram_block_offset(block, addr);
4144             if (!host) {
4145                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4146                 ret = -EINVAL;
4147                 break;
4148             }
4149             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4150             /*
4151              * Postcopy requires that we place whole host pages atomically;
4152              * these may be huge pages for RAMBlocks that are backed by
4153              * hugetlbfs.
4154              * To make it atomic, the data is read into a temporary page
4155              * that's moved into place later.
4156              * The migration protocol uses,  possibly smaller, target-pages
4157              * however the source ensures it always sends all the components
4158              * of a host page in order.
4159              */
4160             page_buffer = postcopy_host_page +
4161                           ((uintptr_t)host & (block->page_size - 1));
4162             /* If all TP are zero then we can optimise the place */
4163             if (!((uintptr_t)host & (block->page_size - 1))) {
4164                 all_zero = true;
4165             } else {
4166                 /* not the 1st TP within the HP */
4167                 if (host != (last_host + TARGET_PAGE_SIZE)) {
4168                     error_report("Non-sequential target page %p/%p",
4169                                   host, last_host);
4170                     ret = -EINVAL;
4171                     break;
4172                 }
4173             }
4174
4175
4176             /*
4177              * If it's the last part of a host page then we place the host
4178              * page
4179              */
4180             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4181                                      (block->page_size - 1)) == 0;
4182             place_source = postcopy_host_page;
4183         }
4184         last_host = host;
4185
4186         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4187         case RAM_SAVE_FLAG_ZERO:
4188             ch = qemu_get_byte(f);
4189             memset(page_buffer, ch, TARGET_PAGE_SIZE);
4190             if (ch) {
4191                 all_zero = false;
4192             }
4193             break;
4194
4195         case RAM_SAVE_FLAG_PAGE:
4196             all_zero = false;
4197             if (!matches_target_page_size) {
4198                 /* For huge pages, we always use temporary buffer */
4199                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4200             } else {
4201                 /*
4202                  * For small pages that matches target page size, we
4203                  * avoid the qemu_file copy.  Instead we directly use
4204                  * the buffer of QEMUFile to place the page.  Note: we
4205                  * cannot do any QEMUFile operation before using that
4206                  * buffer to make sure the buffer is valid when
4207                  * placing the page.
4208                  */
4209                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4210                                          TARGET_PAGE_SIZE);
4211             }
4212             break;
4213         case RAM_SAVE_FLAG_EOS:
4214             /* normal exit */
4215             multifd_recv_sync_main();
4216             break;
4217         default:
4218             error_report("Unknown combination of migration flags: %#x"
4219                          " (postcopy mode)", flags);
4220             ret = -EINVAL;
4221             break;
4222         }
4223
4224         /* Detect for any possible file errors */
4225         if (!ret && qemu_file_get_error(f)) {
4226             ret = qemu_file_get_error(f);
4227         }
4228
4229         if (!ret && place_needed) {
4230             /* This gets called at the last target page in the host page */
4231             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4232
4233             if (all_zero) {
4234                 ret = postcopy_place_page_zero(mis, place_dest,
4235                                                block);
4236             } else {
4237                 ret = postcopy_place_page(mis, place_dest,
4238                                           place_source, block);
4239             }
4240         }
4241     }
4242
4243     return ret;
4244 }
4245
4246 static bool postcopy_is_advised(void)
4247 {
4248     PostcopyState ps = postcopy_state_get();
4249     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4250 }
4251
4252 static bool postcopy_is_running(void)
4253 {
4254     PostcopyState ps = postcopy_state_get();
4255     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4256 }
4257
4258 /*
4259  * Flush content of RAM cache into SVM's memory.
4260  * Only flush the pages that be dirtied by PVM or SVM or both.
4261  */
4262 static void colo_flush_ram_cache(void)
4263 {
4264     RAMBlock *block = NULL;
4265     void *dst_host;
4266     void *src_host;
4267     unsigned long offset = 0;
4268
4269     memory_global_dirty_log_sync();
4270     rcu_read_lock();
4271     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4272         ramblock_sync_dirty_bitmap(ram_state, block);
4273     }
4274     rcu_read_unlock();
4275
4276     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4277     rcu_read_lock();
4278     block = QLIST_FIRST_RCU(&ram_list.blocks);
4279
4280     while (block) {
4281         offset = migration_bitmap_find_dirty(ram_state, block, offset);
4282
4283         if (offset << TARGET_PAGE_BITS >= block->used_length) {
4284             offset = 0;
4285             block = QLIST_NEXT_RCU(block, next);
4286         } else {
4287             migration_bitmap_clear_dirty(ram_state, block, offset);
4288             dst_host = block->host + (offset << TARGET_PAGE_BITS);
4289             src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4290             memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4291         }
4292     }
4293
4294     rcu_read_unlock();
4295     trace_colo_flush_ram_cache_end();
4296 }
4297
4298 /**
4299  * ram_load_precopy: load pages in precopy case
4300  *
4301  * Returns 0 for success or -errno in case of error
4302  *
4303  * Called in precopy mode by ram_load().
4304  * rcu_read_lock is taken prior to this being called.
4305  *
4306  * @f: QEMUFile where to send the data
4307  */
4308 static int ram_load_precopy(QEMUFile *f)
4309 {
4310     int flags = 0, ret = 0, invalid_flags = 0, len = 0;
4311     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4312     bool postcopy_advised = postcopy_is_advised();
4313     if (!migrate_use_compression()) {
4314         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4315     }
4316
4317     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4318         ram_addr_t addr, total_ram_bytes;
4319         void *host = NULL;
4320         uint8_t ch;
4321
4322         addr = qemu_get_be64(f);
4323         flags = addr & ~TARGET_PAGE_MASK;
4324         addr &= TARGET_PAGE_MASK;
4325
4326         if (flags & invalid_flags) {
4327             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4328                 error_report("Received an unexpected compressed page");
4329             }
4330
4331             ret = -EINVAL;
4332             break;
4333         }
4334
4335         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4336                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4337             RAMBlock *block = ram_block_from_stream(f, flags);
4338
4339             /*
4340              * After going into COLO, we should load the Page into colo_cache.
4341              */
4342             if (migration_incoming_in_colo_state()) {
4343                 host = colo_cache_from_block_offset(block, addr);
4344             } else {
4345                 host = host_from_ram_block_offset(block, addr);
4346             }
4347             if (!host) {
4348                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4349                 ret = -EINVAL;
4350                 break;
4351             }
4352
4353             if (!migration_incoming_in_colo_state()) {
4354                 ramblock_recv_bitmap_set(block, host);
4355             }
4356
4357             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4358         }
4359
4360         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4361         case RAM_SAVE_FLAG_MEM_SIZE:
4362             /* Synchronize RAM block list */
4363             total_ram_bytes = addr;
4364             while (!ret && total_ram_bytes) {
4365                 RAMBlock *block;
4366                 char id[256];
4367                 ram_addr_t length;
4368
4369                 len = qemu_get_byte(f);
4370                 qemu_get_buffer(f, (uint8_t *)id, len);
4371                 id[len] = 0;
4372                 length = qemu_get_be64(f);
4373
4374                 block = qemu_ram_block_by_name(id);
4375                 if (block && !qemu_ram_is_migratable(block)) {
4376                     error_report("block %s should not be migrated !", id);
4377                     ret = -EINVAL;
4378                 } else if (block) {
4379                     if (length != block->used_length) {
4380                         Error *local_err = NULL;
4381
4382                         ret = qemu_ram_resize(block, length,
4383                                               &local_err);
4384                         if (local_err) {
4385                             error_report_err(local_err);
4386                         }
4387                     }
4388                     /* For postcopy we need to check hugepage sizes match */
4389                     if (postcopy_advised &&
4390                         block->page_size != qemu_host_page_size) {
4391                         uint64_t remote_page_size = qemu_get_be64(f);
4392                         if (remote_page_size != block->page_size) {
4393                             error_report("Mismatched RAM page size %s "
4394                                          "(local) %zd != %" PRId64,
4395                                          id, block->page_size,
4396                                          remote_page_size);
4397                             ret = -EINVAL;
4398                         }
4399                     }
4400                     if (migrate_ignore_shared()) {
4401                         hwaddr addr = qemu_get_be64(f);
4402                         if (ramblock_is_ignored(block) &&
4403                             block->mr->addr != addr) {
4404                             error_report("Mismatched GPAs for block %s "
4405                                          "%" PRId64 "!= %" PRId64,
4406                                          id, (uint64_t)addr,
4407                                          (uint64_t)block->mr->addr);
4408                             ret = -EINVAL;
4409                         }
4410                     }
4411                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4412                                           block->idstr);
4413                 } else {
4414                     error_report("Unknown ramblock \"%s\", cannot "
4415                                  "accept migration", id);
4416                     ret = -EINVAL;
4417                 }
4418
4419                 total_ram_bytes -= length;
4420             }
4421             break;
4422
4423         case RAM_SAVE_FLAG_ZERO:
4424             ch = qemu_get_byte(f);
4425             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4426             break;
4427
4428         case RAM_SAVE_FLAG_PAGE:
4429             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4430             break;
4431
4432         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4433             len = qemu_get_be32(f);
4434             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4435                 error_report("Invalid compressed data length: %d", len);
4436                 ret = -EINVAL;
4437                 break;
4438             }
4439             decompress_data_with_multi_threads(f, host, len);
4440             break;
4441
4442         case RAM_SAVE_FLAG_XBZRLE:
4443             if (load_xbzrle(f, addr, host) < 0) {
4444                 error_report("Failed to decompress XBZRLE page at "
4445                              RAM_ADDR_FMT, addr);
4446                 ret = -EINVAL;
4447                 break;
4448             }
4449             break;
4450         case RAM_SAVE_FLAG_EOS:
4451             /* normal exit */
4452             multifd_recv_sync_main();
4453             break;
4454         default:
4455             if (flags & RAM_SAVE_FLAG_HOOK) {
4456                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4457             } else {
4458                 error_report("Unknown combination of migration flags: %#x",
4459                              flags);
4460                 ret = -EINVAL;
4461             }
4462         }
4463         if (!ret) {
4464             ret = qemu_file_get_error(f);
4465         }
4466     }
4467
4468     return ret;
4469 }
4470
4471 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4472 {
4473     int ret = 0;
4474     static uint64_t seq_iter;
4475     /*
4476      * If system is running in postcopy mode, page inserts to host memory must
4477      * be atomic
4478      */
4479     bool postcopy_running = postcopy_is_running();
4480
4481     seq_iter++;
4482
4483     if (version_id != 4) {
4484         return -EINVAL;
4485     }
4486
4487     /*
4488      * This RCU critical section can be very long running.
4489      * When RCU reclaims in the code start to become numerous,
4490      * it will be necessary to reduce the granularity of this
4491      * critical section.
4492      */
4493     rcu_read_lock();
4494
4495     if (postcopy_running) {
4496         ret = ram_load_postcopy(f);
4497     } else {
4498         ret = ram_load_precopy(f);
4499     }
4500
4501     ret |= wait_for_decompress_done();
4502     rcu_read_unlock();
4503     trace_ram_load_complete(ret, seq_iter);
4504
4505     if (!ret  && migration_incoming_in_colo_state()) {
4506         colo_flush_ram_cache();
4507     }
4508     return ret;
4509 }
4510
4511 static bool ram_has_postcopy(void *opaque)
4512 {
4513     RAMBlock *rb;
4514     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4515         if (ramblock_is_pmem(rb)) {
4516             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4517                          "is not supported now!", rb->idstr, rb->host);
4518             return false;
4519         }
4520     }
4521
4522     return migrate_postcopy_ram();
4523 }
4524
4525 /* Sync all the dirty bitmap with destination VM.  */
4526 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4527 {
4528     RAMBlock *block;
4529     QEMUFile *file = s->to_dst_file;
4530     int ramblock_count = 0;
4531
4532     trace_ram_dirty_bitmap_sync_start();
4533
4534     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4535         qemu_savevm_send_recv_bitmap(file, block->idstr);
4536         trace_ram_dirty_bitmap_request(block->idstr);
4537         ramblock_count++;
4538     }
4539
4540     trace_ram_dirty_bitmap_sync_wait();
4541
4542     /* Wait until all the ramblocks' dirty bitmap synced */
4543     while (ramblock_count--) {
4544         qemu_sem_wait(&s->rp_state.rp_sem);
4545     }
4546
4547     trace_ram_dirty_bitmap_sync_complete();
4548
4549     return 0;
4550 }
4551
4552 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4553 {
4554     qemu_sem_post(&s->rp_state.rp_sem);
4555 }
4556
4557 /*
4558  * Read the received bitmap, revert it as the initial dirty bitmap.
4559  * This is only used when the postcopy migration is paused but wants
4560  * to resume from a middle point.
4561  */
4562 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4563 {
4564     int ret = -EINVAL;
4565     QEMUFile *file = s->rp_state.from_dst_file;
4566     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4567     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4568     uint64_t size, end_mark;
4569
4570     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4571
4572     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4573         error_report("%s: incorrect state %s", __func__,
4574                      MigrationStatus_str(s->state));
4575         return -EINVAL;
4576     }
4577
4578     /*
4579      * Note: see comments in ramblock_recv_bitmap_send() on why we
4580      * need the endianess convertion, and the paddings.
4581      */
4582     local_size = ROUND_UP(local_size, 8);
4583
4584     /* Add paddings */
4585     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4586
4587     size = qemu_get_be64(file);
4588
4589     /* The size of the bitmap should match with our ramblock */
4590     if (size != local_size) {
4591         error_report("%s: ramblock '%s' bitmap size mismatch "
4592                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4593                      block->idstr, size, local_size);
4594         ret = -EINVAL;
4595         goto out;
4596     }
4597
4598     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4599     end_mark = qemu_get_be64(file);
4600
4601     ret = qemu_file_get_error(file);
4602     if (ret || size != local_size) {
4603         error_report("%s: read bitmap failed for ramblock '%s': %d"
4604                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4605                      __func__, block->idstr, ret, local_size, size);
4606         ret = -EIO;
4607         goto out;
4608     }
4609
4610     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4611         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4612                      __func__, block->idstr, end_mark);
4613         ret = -EINVAL;
4614         goto out;
4615     }
4616
4617     /*
4618      * Endianess convertion. We are during postcopy (though paused).
4619      * The dirty bitmap won't change. We can directly modify it.
4620      */
4621     bitmap_from_le(block->bmap, le_bitmap, nbits);
4622
4623     /*
4624      * What we received is "received bitmap". Revert it as the initial
4625      * dirty bitmap for this ramblock.
4626      */
4627     bitmap_complement(block->bmap, block->bmap, nbits);
4628
4629     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4630
4631     /*
4632      * We succeeded to sync bitmap for current ramblock. If this is
4633      * the last one to sync, we need to notify the main send thread.
4634      */
4635     ram_dirty_bitmap_reload_notify(s);
4636
4637     ret = 0;
4638 out:
4639     g_free(le_bitmap);
4640     return ret;
4641 }
4642
4643 static int ram_resume_prepare(MigrationState *s, void *opaque)
4644 {
4645     RAMState *rs = *(RAMState **)opaque;
4646     int ret;
4647
4648     ret = ram_dirty_bitmap_sync_all(s, rs);
4649     if (ret) {
4650         return ret;
4651     }
4652
4653     ram_state_resume_prepare(rs, s->to_dst_file);
4654
4655     return 0;
4656 }
4657
4658 static SaveVMHandlers savevm_ram_handlers = {
4659     .save_setup = ram_save_setup,
4660     .save_live_iterate = ram_save_iterate,
4661     .save_live_complete_postcopy = ram_save_complete,
4662     .save_live_complete_precopy = ram_save_complete,
4663     .has_postcopy = ram_has_postcopy,
4664     .save_live_pending = ram_save_pending,
4665     .load_state = ram_load,
4666     .save_cleanup = ram_save_cleanup,
4667     .load_setup = ram_load_setup,
4668     .load_cleanup = ram_load_cleanup,
4669     .resume_prepare = ram_resume_prepare,
4670 };
4671
4672 void ram_mig_init(void)
4673 {
4674     qemu_mutex_init(&XBZRLE.lock);
4675     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4676 }