migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <[email protected]>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
  64 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  65  * worked for pages that were filled with the same char.  We switched
  66  * it to only search for the zero value.  And to avoid confusion with
  67  * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
  68  */
  69
  70 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  71 #define RAM_SAVE_FLAG_ZERO     0x02
  72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  73 #define RAM_SAVE_FLAG_PAGE     0x08
  74 #define RAM_SAVE_FLAG_EOS      0x10
  75 #define RAM_SAVE_FLAG_CONTINUE 0x20
  76 #define RAM_SAVE_FLAG_XBZRLE   0x40
  77 /* 0x80 is reserved in migration.h start with 0x100 next */
  78 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
  85 XBZRLECacheStats xbzrle_counters;
  86
  87 /* struct contains XBZRLE cache and a static page
  88    used by the compression */
  89 static struct {
  90     /* buffer used for XBZRLE encoding */
  91     uint8_t *encoded_buf;
  92     /* buffer for storing page content */
  93     uint8_t *current_buf;
  94     /* Cache for XBZRLE, Protected by lock. */
  95     PageCache *cache;
  96     QemuMutex lock;
  97     /* it will store a page full of zeros */
  98     uint8_t *zero_target_page;
  99     /* buffer used for XBZRLE decoding */
 100     uint8_t *decoded_buf;
 101 } XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
 162 static bool ramblock_is_ignored(RAMBlock *block)
 163 {
 164     return !qemu_ram_is_migratable(block) ||
 165            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 166 }
 167
 168 /* Should be holding either ram_list.mutex, or the RCU lock. */
 169 #define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
 170     INTERNAL_RAMBLOCK_FOREACH(block)                   \
 171         if (ramblock_is_ignored(block)) {} else
 172
 173 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
 174     INTERNAL_RAMBLOCK_FOREACH(block)                   \
 175         if (!qemu_ram_is_migratable(block)) {} else
 176
 177 #undef RAMBLOCK_FOREACH
 178
 179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 180 {
 181     RAMBlock *block;
 182     int ret = 0;
 183
 184     rcu_read_lock();
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     rcu_read_unlock();
 192     return ret;
 193 }
 194
 195 static void ramblock_recv_map_init(void)
 196 {
 197     RAMBlock *rb;
 198
 199     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 200         assert(!rb->receivedmap);
 201         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 202     }
 203 }
 204
 205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 206 {
 207     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 208                     rb->receivedmap);
 209 }
 210
 211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 212 {
 213     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 217 {
 218     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 219 }
 220
 221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 222                                     size_t nr)
 223 {
 224     bitmap_set_atomic(rb->receivedmap,
 225                       ramblock_recv_bitmap_offset(host_addr, rb),
 226                       nr);
 227 }
 228
 229 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 230
 231 /*
 232  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 233  *
 234  * Returns >0 if success with sent bytes, or <0 if error.
 235  */
 236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 237                                   const char *block_name)
 238 {
 239     RAMBlock *block = qemu_ram_block_by_name(block_name);
 240     unsigned long *le_bitmap, nbits;
 241     uint64_t size;
 242
 243     if (!block) {
 244         error_report("%s: invalid block name: %s", __func__, block_name);
 245         return -1;
 246     }
 247
 248     nbits = block->used_length >> TARGET_PAGE_BITS;
 249
 250     /*
 251      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 252      * machines we may need 4 more bytes for padding (see below
 253      * comment). So extend it a bit before hand.
 254      */
 255     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 256
 257     /*
 258      * Always use little endian when sending the bitmap. This is
 259      * required that when source and destination VMs are not using the
 260      * same endianess. (Note: big endian won't work.)
 261      */
 262     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 263
 264     /* Size of the bitmap, in bytes */
 265     size = DIV_ROUND_UP(nbits, 8);
 266
 267     /*
 268      * size is always aligned to 8 bytes for 64bit machines, but it
 269      * may not be true for 32bit machines. We need this padding to
 270      * make sure the migration can survive even between 32bit and
 271      * 64bit machines.
 272      */
 273     size = ROUND_UP(size, 8);
 274
 275     qemu_put_be64(file, size);
 276     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 277     /*
 278      * Mark as an end, in case the middle part is screwed up due to
 279      * some "misterious" reason.
 280      */
 281     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 282     qemu_fflush(file);
 283
 284     g_free(le_bitmap);
 285
 286     if (qemu_file_get_error(file)) {
 287         return qemu_file_get_error(file);
 288     }
 289
 290     return size + sizeof(size);
 291 }
 292
 293 /*
 294  * An outstanding page request, on the source, having been received
 295  * and queued
 296  */
 297 struct RAMSrcPageRequest {
 298     RAMBlock *rb;
 299     hwaddr    offset;
 300     hwaddr    len;
 301
 302     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 303 };
 304
 305 /* State of RAM for migration */
 306 struct RAMState {
 307     /* QEMUFile used for this migration */
 308     QEMUFile *f;
 309     /* Last block that we have visited searching for dirty pages */
 310     RAMBlock *last_seen_block;
 311     /* Last block from where we have sent data */
 312     RAMBlock *last_sent_block;
 313     /* Last dirty target page we have sent */
 314     ram_addr_t last_page;
 315     /* last ram version we have seen */
 316     uint32_t last_version;
 317     /* We are in the first round */
 318     bool ram_bulk_stage;
 319     /* The free page optimization is enabled */
 320     bool fpo_enabled;
 321     /* How many times we have dirty too many pages */
 322     int dirty_rate_high_cnt;
 323     /* these variables are used for bitmap sync */
 324     /* last time we did a full bitmap_sync */
 325     int64_t time_last_bitmap_sync;
 326     /* bytes transferred at start_time */
 327     uint64_t bytes_xfer_prev;
 328     /* number of dirty pages since start_time */
 329     uint64_t num_dirty_pages_period;
 330     /* xbzrle misses since the beginning of the period */
 331     uint64_t xbzrle_cache_miss_prev;
 332
 333     /* compression statistics since the beginning of the period */
 334     /* amount of count that no free thread to compress data */
 335     uint64_t compress_thread_busy_prev;
 336     /* amount bytes after compression */
 337     uint64_t compressed_size_prev;
 338     /* amount of compressed pages */
 339     uint64_t compress_pages_prev;
 340
 341     /* total handled target pages at the beginning of period */
 342     uint64_t target_page_count_prev;
 343     /* total handled target pages since start */
 344     uint64_t target_page_count;
 345     /* number of dirty bits in the bitmap */
 346     uint64_t migration_dirty_pages;
 347     /* Protects modification of the bitmap and migration dirty pages */
 348     QemuMutex bitmap_mutex;
 349     /* The RAMBlock used in the last src_page_requests */
 350     RAMBlock *last_req_rb;
 351     /* Queue of outstanding page requests from the destination */
 352     QemuMutex src_page_req_mutex;
 353     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 354 };
 355 typedef struct RAMState RAMState;
 356
 357 static RAMState *ram_state;
 358
 359 static NotifierWithReturnList precopy_notifier_list;
 360
 361 void precopy_infrastructure_init(void)
 362 {
 363     notifier_with_return_list_init(&precopy_notifier_list);
 364 }
 365
 366 void precopy_add_notifier(NotifierWithReturn *n)
 367 {
 368     notifier_with_return_list_add(&precopy_notifier_list, n);
 369 }
 370
 371 void precopy_remove_notifier(NotifierWithReturn *n)
 372 {
 373     notifier_with_return_remove(n);
 374 }
 375
 376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 377 {
 378     PrecopyNotifyData pnd;
 379     pnd.reason = reason;
 380     pnd.errp = errp;
 381
 382     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 383 }
 384
 385 void precopy_enable_free_page_optimization(void)
 386 {
 387     if (!ram_state) {
 388         return;
 389     }
 390
 391     ram_state->fpo_enabled = true;
 392 }
 393
 394 uint64_t ram_bytes_remaining(void)
 395 {
 396     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 397                        0;
 398 }
 399
 400 MigrationStats ram_counters;
 401
 402 /* used by the search for pages to send */
 403 struct PageSearchStatus {
 404     /* Current block being searched */
 405     RAMBlock    *block;
 406     /* Current page to search from */
 407     unsigned long page;
 408     /* Set once we wrap around */
 409     bool         complete_round;
 410 };
 411 typedef struct PageSearchStatus PageSearchStatus;
 412
 413 CompressionStats compression_counters;
 414
 415 struct CompressParam {
 416     bool done;
 417     bool quit;
 418     bool zero_page;
 419     QEMUFile *file;
 420     QemuMutex mutex;
 421     QemuCond cond;
 422     RAMBlock *block;
 423     ram_addr_t offset;
 424
 425     /* internally used fields */
 426     z_stream stream;
 427     uint8_t *originbuf;
 428 };
 429 typedef struct CompressParam CompressParam;
 430
 431 struct DecompressParam {
 432     bool done;
 433     bool quit;
 434     QemuMutex mutex;
 435     QemuCond cond;
 436     void *des;
 437     uint8_t *compbuf;
 438     int len;
 439     z_stream stream;
 440 };
 441 typedef struct DecompressParam DecompressParam;
 442
 443 static CompressParam *comp_param;
 444 static QemuThread *compress_threads;
 445 /* comp_done_cond is used to wake up the migration thread when
 446  * one of the compression threads has finished the compression.
 447  * comp_done_lock is used to co-work with comp_done_cond.
 448  */
 449 static QemuMutex comp_done_lock;
 450 static QemuCond comp_done_cond;
 451 /* The empty QEMUFileOps will be used by file in CompressParam */
 452 static const QEMUFileOps empty_ops = { };
 453
 454 static QEMUFile *decomp_file;
 455 static DecompressParam *decomp_param;
 456 static QemuThread *decompress_threads;
 457 static QemuMutex decomp_done_lock;
 458 static QemuCond decomp_done_cond;
 459
 460 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 461                                  ram_addr_t offset, uint8_t *source_buf);
 462
 463 static void *do_data_compress(void *opaque)
 464 {
 465     CompressParam *param = opaque;
 466     RAMBlock *block;
 467     ram_addr_t offset;
 468     bool zero_page;
 469
 470     qemu_mutex_lock(&param->mutex);
 471     while (!param->quit) {
 472         if (param->block) {
 473             block = param->block;
 474             offset = param->offset;
 475             param->block = NULL;
 476             qemu_mutex_unlock(&param->mutex);
 477
 478             zero_page = do_compress_ram_page(param->file, &param->stream,
 479                                              block, offset, param->originbuf);
 480
 481             qemu_mutex_lock(&comp_done_lock);
 482             param->done = true;
 483             param->zero_page = zero_page;
 484             qemu_cond_signal(&comp_done_cond);
 485             qemu_mutex_unlock(&comp_done_lock);
 486
 487             qemu_mutex_lock(&param->mutex);
 488         } else {
 489             qemu_cond_wait(&param->cond, &param->mutex);
 490         }
 491     }
 492     qemu_mutex_unlock(&param->mutex);
 493
 494     return NULL;
 495 }
 496
 497 static void compress_threads_save_cleanup(void)
 498 {
 499     int i, thread_count;
 500
 501     if (!migrate_use_compression() || !comp_param) {
 502         return;
 503     }
 504
 505     thread_count = migrate_compress_threads();
 506     for (i = 0; i < thread_count; i++) {
 507         /*
 508          * we use it as a indicator which shows if the thread is
 509          * properly init'd or not
 510          */
 511         if (!comp_param[i].file) {
 512             break;
 513         }
 514
 515         qemu_mutex_lock(&comp_param[i].mutex);
 516         comp_param[i].quit = true;
 517         qemu_cond_signal(&comp_param[i].cond);
 518         qemu_mutex_unlock(&comp_param[i].mutex);
 519
 520         qemu_thread_join(compress_threads + i);
 521         qemu_mutex_destroy(&comp_param[i].mutex);
 522         qemu_cond_destroy(&comp_param[i].cond);
 523         deflateEnd(&comp_param[i].stream);
 524         g_free(comp_param[i].originbuf);
 525         qemu_fclose(comp_param[i].file);
 526         comp_param[i].file = NULL;
 527     }
 528     qemu_mutex_destroy(&comp_done_lock);
 529     qemu_cond_destroy(&comp_done_cond);
 530     g_free(compress_threads);
 531     g_free(comp_param);
 532     compress_threads = NULL;
 533     comp_param = NULL;
 534 }
 535
 536 static int compress_threads_save_setup(void)
 537 {
 538     int i, thread_count;
 539
 540     if (!migrate_use_compression()) {
 541         return 0;
 542     }
 543     thread_count = migrate_compress_threads();
 544     compress_threads = g_new0(QemuThread, thread_count);
 545     comp_param = g_new0(CompressParam, thread_count);
 546     qemu_cond_init(&comp_done_cond);
 547     qemu_mutex_init(&comp_done_lock);
 548     for (i = 0; i < thread_count; i++) {
 549         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 550         if (!comp_param[i].originbuf) {
 551             goto exit;
 552         }
 553
 554         if (deflateInit(&comp_param[i].stream,
 555                         migrate_compress_level()) != Z_OK) {
 556             g_free(comp_param[i].originbuf);
 557             goto exit;
 558         }
 559
 560         /* comp_param[i].file is just used as a dummy buffer to save data,
 561          * set its ops to empty.
 562          */
 563         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 564         comp_param[i].done = true;
 565         comp_param[i].quit = false;
 566         qemu_mutex_init(&comp_param[i].mutex);
 567         qemu_cond_init(&comp_param[i].cond);
 568         qemu_thread_create(compress_threads + i, "compress",
 569                            do_data_compress, comp_param + i,
 570                            QEMU_THREAD_JOINABLE);
 571     }
 572     return 0;
 573
 574 exit:
 575     compress_threads_save_cleanup();
 576     return -1;
 577 }
 578
 579 /* Multiple fd's */
 580
 581 #define MULTIFD_MAGIC 0x11223344U
 582 #define MULTIFD_VERSION 1
 583
 584 #define MULTIFD_FLAG_SYNC (1 << 0)
 585
 586 /* This value needs to be a multiple of qemu_target_page_size() */
 587 #define MULTIFD_PACKET_SIZE (512 * 1024)
 588
 589 typedef struct {
 590     uint32_t magic;
 591     uint32_t version;
 592     unsigned char uuid[16]; /* QemuUUID */
 593     uint8_t id;
 594     uint8_t unused1[7];     /* Reserved for future use */
 595     uint64_t unused2[4];    /* Reserved for future use */
 596 } __attribute__((packed)) MultiFDInit_t;
 597
 598 typedef struct {
 599     uint32_t magic;
 600     uint32_t version;
 601     uint32_t flags;
 602     /* maximum number of allocated pages */
 603     uint32_t pages_alloc;
 604     uint32_t pages_used;
 605     /* size of the next packet that contains pages */
 606     uint32_t next_packet_size;
 607     uint64_t packet_num;
 608     uint64_t unused[4];    /* Reserved for future use */
 609     char ramblock[256];
 610     uint64_t offset[];
 611 } __attribute__((packed)) MultiFDPacket_t;
 612
 613 typedef struct {
 614     /* number of used pages */
 615     uint32_t used;
 616     /* number of allocated pages */
 617     uint32_t allocated;
 618     /* global number of generated multifd packets */
 619     uint64_t packet_num;
 620     /* offset of each page */
 621     ram_addr_t *offset;
 622     /* pointer to each page */
 623     struct iovec *iov;
 624     RAMBlock *block;
 625 } MultiFDPages_t;
 626
 627 typedef struct {
 628     /* this fields are not changed once the thread is created */
 629     /* channel number */
 630     uint8_t id;
 631     /* channel thread name */
 632     char *name;
 633     /* channel thread id */
 634     QemuThread thread;
 635     /* communication channel */
 636     QIOChannel *c;
 637     /* sem where to wait for more work */
 638     QemuSemaphore sem;
 639     /* this mutex protects the following parameters */
 640     QemuMutex mutex;
 641     /* is this channel thread running */
 642     bool running;
 643     /* should this thread finish */
 644     bool quit;
 645     /* thread has work to do */
 646     int pending_job;
 647     /* array of pages to sent */
 648     MultiFDPages_t *pages;
 649     /* packet allocated len */
 650     uint32_t packet_len;
 651     /* pointer to the packet */
 652     MultiFDPacket_t *packet;
 653     /* multifd flags for each packet */
 654     uint32_t flags;
 655     /* size of the next packet that contains pages */
 656     uint32_t next_packet_size;
 657     /* global number of generated multifd packets */
 658     uint64_t packet_num;
 659     /* thread local variables */
 660     /* packets sent through this channel */
 661     uint64_t num_packets;
 662     /* pages sent through this channel */
 663     uint64_t num_pages;
 664 }  MultiFDSendParams;
 665
 666 typedef struct {
 667     /* this fields are not changed once the thread is created */
 668     /* channel number */
 669     uint8_t id;
 670     /* channel thread name */
 671     char *name;
 672     /* channel thread id */
 673     QemuThread thread;
 674     /* communication channel */
 675     QIOChannel *c;
 676     /* this mutex protects the following parameters */
 677     QemuMutex mutex;
 678     /* is this channel thread running */
 679     bool running;
 680     /* should this thread finish */
 681     bool quit;
 682     /* array of pages to receive */
 683     MultiFDPages_t *pages;
 684     /* packet allocated len */
 685     uint32_t packet_len;
 686     /* pointer to the packet */
 687     MultiFDPacket_t *packet;
 688     /* multifd flags for each packet */
 689     uint32_t flags;
 690     /* global number of generated multifd packets */
 691     uint64_t packet_num;
 692     /* thread local variables */
 693     /* size of the next packet that contains pages */
 694     uint32_t next_packet_size;
 695     /* packets sent through this channel */
 696     uint64_t num_packets;
 697     /* pages sent through this channel */
 698     uint64_t num_pages;
 699     /* syncs main thread and channels */
 700     QemuSemaphore sem_sync;
 701 } MultiFDRecvParams;
 702
 703 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 704 {
 705     MultiFDInit_t msg;
 706     int ret;
 707
 708     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 709     msg.version = cpu_to_be32(MULTIFD_VERSION);
 710     msg.id = p->id;
 711     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 712
 713     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 714     if (ret != 0) {
 715         return -1;
 716     }
 717     return 0;
 718 }
 719
 720 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 721 {
 722     MultiFDInit_t msg;
 723     int ret;
 724
 725     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 726     if (ret != 0) {
 727         return -1;
 728     }
 729
 730     msg.magic = be32_to_cpu(msg.magic);
 731     msg.version = be32_to_cpu(msg.version);
 732
 733     if (msg.magic != MULTIFD_MAGIC) {
 734         error_setg(errp, "multifd: received packet magic %x "
 735                    "expected %x", msg.magic, MULTIFD_MAGIC);
 736         return -1;
 737     }
 738
 739     if (msg.version != MULTIFD_VERSION) {
 740         error_setg(errp, "multifd: received packet version %d "
 741                    "expected %d", msg.version, MULTIFD_VERSION);
 742         return -1;
 743     }
 744
 745     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 746         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 747         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 748
 749         error_setg(errp, "multifd: received uuid '%s' and expected "
 750                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 751         g_free(uuid);
 752         g_free(msg_uuid);
 753         return -1;
 754     }
 755
 756     if (msg.id > migrate_multifd_channels()) {
 757         error_setg(errp, "multifd: received channel version %d "
 758                    "expected %d", msg.version, MULTIFD_VERSION);
 759         return -1;
 760     }
 761
 762     return msg.id;
 763 }
 764
 765 static MultiFDPages_t *multifd_pages_init(size_t size)
 766 {
 767     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 768
 769     pages->allocated = size;
 770     pages->iov = g_new0(struct iovec, size);
 771     pages->offset = g_new0(ram_addr_t, size);
 772
 773     return pages;
 774 }
 775
 776 static void multifd_pages_clear(MultiFDPages_t *pages)
 777 {
 778     pages->used = 0;
 779     pages->allocated = 0;
 780     pages->packet_num = 0;
 781     pages->block = NULL;
 782     g_free(pages->iov);
 783     pages->iov = NULL;
 784     g_free(pages->offset);
 785     pages->offset = NULL;
 786     g_free(pages);
 787 }
 788
 789 static void multifd_send_fill_packet(MultiFDSendParams *p)
 790 {
 791     MultiFDPacket_t *packet = p->packet;
 792     uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 793     int i;
 794
 795     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 796     packet->version = cpu_to_be32(MULTIFD_VERSION);
 797     packet->flags = cpu_to_be32(p->flags);
 798     packet->pages_alloc = cpu_to_be32(page_max);
 799     packet->pages_used = cpu_to_be32(p->pages->used);
 800     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 801     packet->packet_num = cpu_to_be64(p->packet_num);
 802
 803     if (p->pages->block) {
 804         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 805     }
 806
 807     for (i = 0; i < p->pages->used; i++) {
 808         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 809     }
 810 }
 811
 812 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 813 {
 814     MultiFDPacket_t *packet = p->packet;
 815     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 816     RAMBlock *block;
 817     int i;
 818
 819     packet->magic = be32_to_cpu(packet->magic);
 820     if (packet->magic != MULTIFD_MAGIC) {
 821         error_setg(errp, "multifd: received packet "
 822                    "magic %x and expected magic %x",
 823                    packet->magic, MULTIFD_MAGIC);
 824         return -1;
 825     }
 826
 827     packet->version = be32_to_cpu(packet->version);
 828     if (packet->version != MULTIFD_VERSION) {
 829         error_setg(errp, "multifd: received packet "
 830                    "version %d and expected version %d",
 831                    packet->version, MULTIFD_VERSION);
 832         return -1;
 833     }
 834
 835     p->flags = be32_to_cpu(packet->flags);
 836
 837     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
 838     /*
 839      * If we recevied a packet that is 100 times bigger than expected
 840      * just stop migration.  It is a magic number.
 841      */
 842     if (packet->pages_alloc > pages_max * 100) {
 843         error_setg(errp, "multifd: received packet "
 844                    "with size %d and expected a maximum size of %d",
 845                    packet->pages_alloc, pages_max * 100) ;
 846         return -1;
 847     }
 848     /*
 849      * We received a packet that is bigger than expected but inside
 850      * reasonable limits (see previous comment).  Just reallocate.
 851      */
 852     if (packet->pages_alloc > p->pages->allocated) {
 853         multifd_pages_clear(p->pages);
 854         p->pages = multifd_pages_init(packet->pages_alloc);
 855     }
 856
 857     p->pages->used = be32_to_cpu(packet->pages_used);
 858     if (p->pages->used > packet->pages_alloc) {
 859         error_setg(errp, "multifd: received packet "
 860                    "with %d pages and expected maximum pages are %d",
 861                    p->pages->used, packet->pages_alloc) ;
 862         return -1;
 863     }
 864
 865     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
 866     p->packet_num = be64_to_cpu(packet->packet_num);
 867
 868     if (p->pages->used) {
 869         /* make sure that ramblock is 0 terminated */
 870         packet->ramblock[255] = 0;
 871         block = qemu_ram_block_by_name(packet->ramblock);
 872         if (!block) {
 873             error_setg(errp, "multifd: unknown ram block %s",
 874                        packet->ramblock);
 875             return -1;
 876         }
 877     }
 878
 879     for (i = 0; i < p->pages->used; i++) {
 880         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 881
 882         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 883             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 884                        " (max " RAM_ADDR_FMT ")",
 885                        offset, block->max_length);
 886             return -1;
 887         }
 888         p->pages->iov[i].iov_base = block->host + offset;
 889         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 890     }
 891
 892     return 0;
 893 }
 894
 895 struct {
 896     MultiFDSendParams *params;
 897     /* array of pages to sent */
 898     MultiFDPages_t *pages;
 899     /* syncs main thread and channels */
 900     QemuSemaphore sem_sync;
 901     /* global number of generated multifd packets */
 902     uint64_t packet_num;
 903     /* send channels ready */
 904     QemuSemaphore channels_ready;
 905 } *multifd_send_state;
 906
 907 /*
 908  * How we use multifd_send_state->pages and channel->pages?
 909  *
 910  * We create a pages for each channel, and a main one.  Each time that
 911  * we need to send a batch of pages we interchange the ones between
 912  * multifd_send_state and the channel that is sending it.  There are
 913  * two reasons for that:
 914  *    - to not have to do so many mallocs during migration
 915  *    - to make easier to know what to free at the end of migration
 916  *
 917  * This way we always know who is the owner of each "pages" struct,
 918  * and we don't need any locking.  It belongs to the migration thread
 919  * or to the channel thread.  Switching is safe because the migration
 920  * thread is using the channel mutex when changing it, and the channel
 921  * must have finished with its own, otherwise pending_job can't be
 922  * false.
 923  */
 924
 925 static int multifd_send_pages(RAMState *rs)
 926 {
 927     int i;
 928     static int next_channel;
 929     MultiFDSendParams *p = NULL; /* make happy gcc */
 930     MultiFDPages_t *pages = multifd_send_state->pages;
 931     uint64_t transferred;
 932
 933     qemu_sem_wait(&multifd_send_state->channels_ready);
 934     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 935         p = &multifd_send_state->params[i];
 936
 937         qemu_mutex_lock(&p->mutex);
 938         if (p->quit) {
 939             error_report("%s: channel %d has already quit!", __func__, i);
 940             qemu_mutex_unlock(&p->mutex);
 941             return -1;
 942         }
 943         if (!p->pending_job) {
 944             p->pending_job++;
 945             next_channel = (i + 1) % migrate_multifd_channels();
 946             break;
 947         }
 948         qemu_mutex_unlock(&p->mutex);
 949     }
 950     p->pages->used = 0;
 951
 952     p->packet_num = multifd_send_state->packet_num++;
 953     p->pages->block = NULL;
 954     multifd_send_state->pages = p->pages;
 955     p->pages = pages;
 956     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 957     qemu_file_update_transfer(rs->f, transferred);
 958     ram_counters.multifd_bytes += transferred;
 959     ram_counters.transferred += transferred;;
 960     qemu_mutex_unlock(&p->mutex);
 961     qemu_sem_post(&p->sem);
 962
 963     return 1;
 964 }
 965
 966 static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 967 {
 968     MultiFDPages_t *pages = multifd_send_state->pages;
 969
 970     if (!pages->block) {
 971         pages->block = block;
 972     }
 973
 974     if (pages->block == block) {
 975         pages->offset[pages->used] = offset;
 976         pages->iov[pages->used].iov_base = block->host + offset;
 977         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 978         pages->used++;
 979
 980         if (pages->used < pages->allocated) {
 981             return 1;
 982         }
 983     }
 984
 985     if (multifd_send_pages(rs) < 0) {
 986         return -1;
 987     }
 988
 989     if (pages->block != block) {
 990         return  multifd_queue_page(rs, block, offset);
 991     }
 992
 993     return 1;
 994 }
 995
 996 static void multifd_send_terminate_threads(Error *err)
 997 {
 998     int i;
 999
1000     if (err) {
1001         MigrationState *s = migrate_get_current();
1002         migrate_set_error(s, err);
1003         if (s->state == MIGRATION_STATUS_SETUP ||
1004             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1005             s->state == MIGRATION_STATUS_DEVICE ||
1006             s->state == MIGRATION_STATUS_ACTIVE) {
1007             migrate_set_state(&s->state, s->state,
1008                               MIGRATION_STATUS_FAILED);
1009         }
1010     }
1011
1012     for (i = 0; i < migrate_multifd_channels(); i++) {
1013         MultiFDSendParams *p = &multifd_send_state->params[i];
1014
1015         qemu_mutex_lock(&p->mutex);
1016         p->quit = true;
1017         qemu_sem_post(&p->sem);
1018         qemu_mutex_unlock(&p->mutex);
1019     }
1020 }
1021
1022 void multifd_save_cleanup(void)
1023 {
1024     int i;
1025
1026     if (!migrate_use_multifd()) {
1027         return;
1028     }
1029     multifd_send_terminate_threads(NULL);
1030     for (i = 0; i < migrate_multifd_channels(); i++) {
1031         MultiFDSendParams *p = &multifd_send_state->params[i];
1032
1033         if (p->running) {
1034             qemu_thread_join(&p->thread);
1035         }
1036         socket_send_channel_destroy(p->c);
1037         p->c = NULL;
1038         qemu_mutex_destroy(&p->mutex);
1039         qemu_sem_destroy(&p->sem);
1040         g_free(p->name);
1041         p->name = NULL;
1042         multifd_pages_clear(p->pages);
1043         p->pages = NULL;
1044         p->packet_len = 0;
1045         g_free(p->packet);
1046         p->packet = NULL;
1047     }
1048     qemu_sem_destroy(&multifd_send_state->channels_ready);
1049     qemu_sem_destroy(&multifd_send_state->sem_sync);
1050     g_free(multifd_send_state->params);
1051     multifd_send_state->params = NULL;
1052     multifd_pages_clear(multifd_send_state->pages);
1053     multifd_send_state->pages = NULL;
1054     g_free(multifd_send_state);
1055     multifd_send_state = NULL;
1056 }
1057
1058 static void multifd_send_sync_main(RAMState *rs)
1059 {
1060     int i;
1061
1062     if (!migrate_use_multifd()) {
1063         return;
1064     }
1065     if (multifd_send_state->pages->used) {
1066         if (multifd_send_pages(rs) < 0) {
1067             error_report("%s: multifd_send_pages fail", __func__);
1068             return;
1069         }
1070     }
1071     for (i = 0; i < migrate_multifd_channels(); i++) {
1072         MultiFDSendParams *p = &multifd_send_state->params[i];
1073
1074         trace_multifd_send_sync_main_signal(p->id);
1075
1076         qemu_mutex_lock(&p->mutex);
1077
1078         if (p->quit) {
1079             error_report("%s: channel %d has already quit", __func__, i);
1080             qemu_mutex_unlock(&p->mutex);
1081             return;
1082         }
1083
1084         p->packet_num = multifd_send_state->packet_num++;
1085         p->flags |= MULTIFD_FLAG_SYNC;
1086         p->pending_job++;
1087         qemu_file_update_transfer(rs->f, p->packet_len);
1088         ram_counters.multifd_bytes += p->packet_len;
1089         ram_counters.transferred += p->packet_len;
1090         qemu_mutex_unlock(&p->mutex);
1091         qemu_sem_post(&p->sem);
1092     }
1093     for (i = 0; i < migrate_multifd_channels(); i++) {
1094         MultiFDSendParams *p = &multifd_send_state->params[i];
1095
1096         trace_multifd_send_sync_main_wait(p->id);
1097         qemu_sem_wait(&multifd_send_state->sem_sync);
1098     }
1099     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1100 }
1101
1102 static void *multifd_send_thread(void *opaque)
1103 {
1104     MultiFDSendParams *p = opaque;
1105     Error *local_err = NULL;
1106     int ret = 0;
1107     uint32_t flags = 0;
1108
1109     trace_multifd_send_thread_start(p->id);
1110     rcu_register_thread();
1111
1112     if (multifd_send_initial_packet(p, &local_err) < 0) {
1113         goto out;
1114     }
1115     /* initial packet */
1116     p->num_packets = 1;
1117
1118     while (true) {
1119         qemu_sem_wait(&p->sem);
1120         qemu_mutex_lock(&p->mutex);
1121
1122         if (p->pending_job) {
1123             uint32_t used = p->pages->used;
1124             uint64_t packet_num = p->packet_num;
1125             flags = p->flags;
1126
1127             p->next_packet_size = used * qemu_target_page_size();
1128             multifd_send_fill_packet(p);
1129             p->flags = 0;
1130             p->num_packets++;
1131             p->num_pages += used;
1132             p->pages->used = 0;
1133             qemu_mutex_unlock(&p->mutex);
1134
1135             trace_multifd_send(p->id, packet_num, used, flags,
1136                                p->next_packet_size);
1137
1138             ret = qio_channel_write_all(p->c, (void *)p->packet,
1139                                         p->packet_len, &local_err);
1140             if (ret != 0) {
1141                 break;
1142             }
1143
1144             if (used) {
1145                 ret = qio_channel_writev_all(p->c, p->pages->iov,
1146                                              used, &local_err);
1147                 if (ret != 0) {
1148                     break;
1149                 }
1150             }
1151
1152             qemu_mutex_lock(&p->mutex);
1153             p->pending_job--;
1154             qemu_mutex_unlock(&p->mutex);
1155
1156             if (flags & MULTIFD_FLAG_SYNC) {
1157                 qemu_sem_post(&multifd_send_state->sem_sync);
1158             }
1159             qemu_sem_post(&multifd_send_state->channels_ready);
1160         } else if (p->quit) {
1161             qemu_mutex_unlock(&p->mutex);
1162             break;
1163         } else {
1164             qemu_mutex_unlock(&p->mutex);
1165             /* sometimes there are spurious wakeups */
1166         }
1167     }
1168
1169 out:
1170     if (local_err) {
1171         multifd_send_terminate_threads(local_err);
1172     }
1173
1174     /*
1175      * Error happen, I will exit, but I can't just leave, tell
1176      * who pay attention to me.
1177      */
1178     if (ret != 0) {
1179         if (flags & MULTIFD_FLAG_SYNC) {
1180             qemu_sem_post(&multifd_send_state->sem_sync);
1181         }
1182         qemu_sem_post(&multifd_send_state->channels_ready);
1183     }
1184
1185     qemu_mutex_lock(&p->mutex);
1186     p->running = false;
1187     qemu_mutex_unlock(&p->mutex);
1188
1189     rcu_unregister_thread();
1190     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1191
1192     return NULL;
1193 }
1194
1195 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1196 {
1197     MultiFDSendParams *p = opaque;
1198     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1199     Error *local_err = NULL;
1200
1201     if (qio_task_propagate_error(task, &local_err)) {
1202         migrate_set_error(migrate_get_current(), local_err);
1203         multifd_save_cleanup();
1204     } else {
1205         p->c = QIO_CHANNEL(sioc);
1206         qio_channel_set_delay(p->c, false);
1207         p->running = true;
1208         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1209                            QEMU_THREAD_JOINABLE);
1210     }
1211 }
1212
1213 int multifd_save_setup(void)
1214 {
1215     int thread_count;
1216     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1217     uint8_t i;
1218
1219     if (!migrate_use_multifd()) {
1220         return 0;
1221     }
1222     thread_count = migrate_multifd_channels();
1223     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1224     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1225     multifd_send_state->pages = multifd_pages_init(page_count);
1226     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1227     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1228
1229     for (i = 0; i < thread_count; i++) {
1230         MultiFDSendParams *p = &multifd_send_state->params[i];
1231
1232         qemu_mutex_init(&p->mutex);
1233         qemu_sem_init(&p->sem, 0);
1234         p->quit = false;
1235         p->pending_job = 0;
1236         p->id = i;
1237         p->pages = multifd_pages_init(page_count);
1238         p->packet_len = sizeof(MultiFDPacket_t)
1239                       + sizeof(ram_addr_t) * page_count;
1240         p->packet = g_malloc0(p->packet_len);
1241         p->name = g_strdup_printf("multifdsend_%d", i);
1242         socket_send_channel_create(multifd_new_send_channel_async, p);
1243     }
1244     return 0;
1245 }
1246
1247 struct {
1248     MultiFDRecvParams *params;
1249     /* number of created threads */
1250     int count;
1251     /* syncs main thread and channels */
1252     QemuSemaphore sem_sync;
1253     /* global number of generated multifd packets */
1254     uint64_t packet_num;
1255 } *multifd_recv_state;
1256
1257 static void multifd_recv_terminate_threads(Error *err)
1258 {
1259     int i;
1260
1261     if (err) {
1262         MigrationState *s = migrate_get_current();
1263         migrate_set_error(s, err);
1264         if (s->state == MIGRATION_STATUS_SETUP ||
1265             s->state == MIGRATION_STATUS_ACTIVE) {
1266             migrate_set_state(&s->state, s->state,
1267                               MIGRATION_STATUS_FAILED);
1268         }
1269     }
1270
1271     for (i = 0; i < migrate_multifd_channels(); i++) {
1272         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1273
1274         qemu_mutex_lock(&p->mutex);
1275         p->quit = true;
1276         /* We could arrive here for two reasons:
1277            - normal quit, i.e. everything went fine, just finished
1278            - error quit: We close the channels so the channel threads
1279              finish the qio_channel_read_all_eof() */
1280         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1281         qemu_mutex_unlock(&p->mutex);
1282     }
1283 }
1284
1285 int multifd_load_cleanup(Error **errp)
1286 {
1287     int i;
1288     int ret = 0;
1289
1290     if (!migrate_use_multifd()) {
1291         return 0;
1292     }
1293     multifd_recv_terminate_threads(NULL);
1294     for (i = 0; i < migrate_multifd_channels(); i++) {
1295         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1296
1297         if (p->running) {
1298             p->quit = true;
1299             /*
1300              * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
1301              * however try to wakeup it without harm in cleanup phase.
1302              */
1303             qemu_sem_post(&p->sem_sync);
1304             qemu_thread_join(&p->thread);
1305         }
1306         object_unref(OBJECT(p->c));
1307         p->c = NULL;
1308         qemu_mutex_destroy(&p->mutex);
1309         qemu_sem_destroy(&p->sem_sync);
1310         g_free(p->name);
1311         p->name = NULL;
1312         multifd_pages_clear(p->pages);
1313         p->pages = NULL;
1314         p->packet_len = 0;
1315         g_free(p->packet);
1316         p->packet = NULL;
1317     }
1318     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1319     g_free(multifd_recv_state->params);
1320     multifd_recv_state->params = NULL;
1321     g_free(multifd_recv_state);
1322     multifd_recv_state = NULL;
1323
1324     return ret;
1325 }
1326
1327 static void multifd_recv_sync_main(void)
1328 {
1329     int i;
1330
1331     if (!migrate_use_multifd()) {
1332         return;
1333     }
1334     for (i = 0; i < migrate_multifd_channels(); i++) {
1335         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1336
1337         trace_multifd_recv_sync_main_wait(p->id);
1338         qemu_sem_wait(&multifd_recv_state->sem_sync);
1339     }
1340     for (i = 0; i < migrate_multifd_channels(); i++) {
1341         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1342
1343         qemu_mutex_lock(&p->mutex);
1344         if (multifd_recv_state->packet_num < p->packet_num) {
1345             multifd_recv_state->packet_num = p->packet_num;
1346         }
1347         qemu_mutex_unlock(&p->mutex);
1348         trace_multifd_recv_sync_main_signal(p->id);
1349         qemu_sem_post(&p->sem_sync);
1350     }
1351     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1352 }
1353
1354 static void *multifd_recv_thread(void *opaque)
1355 {
1356     MultiFDRecvParams *p = opaque;
1357     Error *local_err = NULL;
1358     int ret;
1359
1360     trace_multifd_recv_thread_start(p->id);
1361     rcu_register_thread();
1362
1363     while (true) {
1364         uint32_t used;
1365         uint32_t flags;
1366
1367         if (p->quit) {
1368             break;
1369         }
1370
1371         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1372                                        p->packet_len, &local_err);
1373         if (ret == 0) {   /* EOF */
1374             break;
1375         }
1376         if (ret == -1) {   /* Error */
1377             break;
1378         }
1379
1380         qemu_mutex_lock(&p->mutex);
1381         ret = multifd_recv_unfill_packet(p, &local_err);
1382         if (ret) {
1383             qemu_mutex_unlock(&p->mutex);
1384             break;
1385         }
1386
1387         used = p->pages->used;
1388         flags = p->flags;
1389         trace_multifd_recv(p->id, p->packet_num, used, flags,
1390                            p->next_packet_size);
1391         p->num_packets++;
1392         p->num_pages += used;
1393         qemu_mutex_unlock(&p->mutex);
1394
1395         if (used) {
1396             ret = qio_channel_readv_all(p->c, p->pages->iov,
1397                                         used, &local_err);
1398             if (ret != 0) {
1399                 break;
1400             }
1401         }
1402
1403         if (flags & MULTIFD_FLAG_SYNC) {
1404             qemu_sem_post(&multifd_recv_state->sem_sync);
1405             qemu_sem_wait(&p->sem_sync);
1406         }
1407     }
1408
1409     if (local_err) {
1410         multifd_recv_terminate_threads(local_err);
1411     }
1412     qemu_mutex_lock(&p->mutex);
1413     p->running = false;
1414     qemu_mutex_unlock(&p->mutex);
1415
1416     rcu_unregister_thread();
1417     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1418
1419     return NULL;
1420 }
1421
1422 int multifd_load_setup(void)
1423 {
1424     int thread_count;
1425     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1426     uint8_t i;
1427
1428     if (!migrate_use_multifd()) {
1429         return 0;
1430     }
1431     thread_count = migrate_multifd_channels();
1432     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1433     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1434     atomic_set(&multifd_recv_state->count, 0);
1435     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1436
1437     for (i = 0; i < thread_count; i++) {
1438         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1439
1440         qemu_mutex_init(&p->mutex);
1441         qemu_sem_init(&p->sem_sync, 0);
1442         p->quit = false;
1443         p->id = i;
1444         p->pages = multifd_pages_init(page_count);
1445         p->packet_len = sizeof(MultiFDPacket_t)
1446                       + sizeof(ram_addr_t) * page_count;
1447         p->packet = g_malloc0(p->packet_len);
1448         p->name = g_strdup_printf("multifdrecv_%d", i);
1449     }
1450     return 0;
1451 }
1452
1453 bool multifd_recv_all_channels_created(void)
1454 {
1455     int thread_count = migrate_multifd_channels();
1456
1457     if (!migrate_use_multifd()) {
1458         return true;
1459     }
1460
1461     return thread_count == atomic_read(&multifd_recv_state->count);
1462 }
1463
1464 /*
1465  * Try to receive all multifd channels to get ready for the migration.
1466  * - Return true and do not set @errp when correctly receving all channels;
1467  * - Return false and do not set @errp when correctly receiving the current one;
1468  * - Return false and set @errp when failing to receive the current channel.
1469  */
1470 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1471 {
1472     MultiFDRecvParams *p;
1473     Error *local_err = NULL;
1474     int id;
1475
1476     id = multifd_recv_initial_packet(ioc, &local_err);
1477     if (id < 0) {
1478         multifd_recv_terminate_threads(local_err);
1479         error_propagate_prepend(errp, local_err,
1480                                 "failed to receive packet"
1481                                 " via multifd channel %d: ",
1482                                 atomic_read(&multifd_recv_state->count));
1483         return false;
1484     }
1485
1486     p = &multifd_recv_state->params[id];
1487     if (p->c != NULL) {
1488         error_setg(&local_err, "multifd: received id '%d' already setup'",
1489                    id);
1490         multifd_recv_terminate_threads(local_err);
1491         error_propagate(errp, local_err);
1492         return false;
1493     }
1494     p->c = ioc;
1495     object_ref(OBJECT(ioc));
1496     /* initial packet */
1497     p->num_packets = 1;
1498
1499     p->running = true;
1500     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1501                        QEMU_THREAD_JOINABLE);
1502     atomic_inc(&multifd_recv_state->count);
1503     return atomic_read(&multifd_recv_state->count) ==
1504            migrate_multifd_channels();
1505 }
1506
1507 /**
1508  * save_page_header: write page header to wire
1509  *
1510  * If this is the 1st block, it also writes the block identification
1511  *
1512  * Returns the number of bytes written
1513  *
1514  * @f: QEMUFile where to send the data
1515  * @block: block that contains the page we want to send
1516  * @offset: offset inside the block for the page
1517  *          in the lower bits, it contains flags
1518  */
1519 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1520                                ram_addr_t offset)
1521 {
1522     size_t size, len;
1523
1524     if (block == rs->last_sent_block) {
1525         offset |= RAM_SAVE_FLAG_CONTINUE;
1526     }
1527     qemu_put_be64(f, offset);
1528     size = 8;
1529
1530     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1531         len = strlen(block->idstr);
1532         qemu_put_byte(f, len);
1533         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1534         size += 1 + len;
1535         rs->last_sent_block = block;
1536     }
1537     return size;
1538 }
1539
1540 /**
1541  * mig_throttle_guest_down: throotle down the guest
1542  *
1543  * Reduce amount of guest cpu execution to hopefully slow down memory
1544  * writes. If guest dirty memory rate is reduced below the rate at
1545  * which we can transfer pages to the destination then we should be
1546  * able to complete migration. Some workloads dirty memory way too
1547  * fast and will not effectively converge, even with auto-converge.
1548  */
1549 static void mig_throttle_guest_down(void)
1550 {
1551     MigrationState *s = migrate_get_current();
1552     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1553     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1554     int pct_max = s->parameters.max_cpu_throttle;
1555
1556     /* We have not started throttling yet. Let's start it. */
1557     if (!cpu_throttle_active()) {
1558         cpu_throttle_set(pct_initial);
1559     } else {
1560         /* Throttling already on, just increase the rate */
1561         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1562                          pct_max));
1563     }
1564 }
1565
1566 /**
1567  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1568  *
1569  * @rs: current RAM state
1570  * @current_addr: address for the zero page
1571  *
1572  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1573  * The important thing is that a stale (not-yet-0'd) page be replaced
1574  * by the new data.
1575  * As a bonus, if the page wasn't in the cache it gets added so that
1576  * when a small write is made into the 0'd page it gets XBZRLE sent.
1577  */
1578 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1579 {
1580     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1581         return;
1582     }
1583
1584     /* We don't care if this fails to allocate a new cache page
1585      * as long as it updated an old one */
1586     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1587                  ram_counters.dirty_sync_count);
1588 }
1589
1590 #define ENCODING_FLAG_XBZRLE 0x1
1591
1592 /**
1593  * save_xbzrle_page: compress and send current page
1594  *
1595  * Returns: 1 means that we wrote the page
1596  *          0 means that page is identical to the one already sent
1597  *          -1 means that xbzrle would be longer than normal
1598  *
1599  * @rs: current RAM state
1600  * @current_data: pointer to the address of the page contents
1601  * @current_addr: addr of the page
1602  * @block: block that contains the page we want to send
1603  * @offset: offset inside the block for the page
1604  * @last_stage: if we are at the completion stage
1605  */
1606 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1607                             ram_addr_t current_addr, RAMBlock *block,
1608                             ram_addr_t offset, bool last_stage)
1609 {
1610     int encoded_len = 0, bytes_xbzrle;
1611     uint8_t *prev_cached_page;
1612
1613     if (!cache_is_cached(XBZRLE.cache, current_addr,
1614                          ram_counters.dirty_sync_count)) {
1615         xbzrle_counters.cache_miss++;
1616         if (!last_stage) {
1617             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1618                              ram_counters.dirty_sync_count) == -1) {
1619                 return -1;
1620             } else {
1621                 /* update *current_data when the page has been
1622                    inserted into cache */
1623                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1624             }
1625         }
1626         return -1;
1627     }
1628
1629     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1630
1631     /* save current buffer into memory */
1632     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1633
1634     /* XBZRLE encoding (if there is no overflow) */
1635     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1636                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1637                                        TARGET_PAGE_SIZE);
1638
1639     /*
1640      * Update the cache contents, so that it corresponds to the data
1641      * sent, in all cases except where we skip the page.
1642      */
1643     if (!last_stage && encoded_len != 0) {
1644         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1645         /*
1646          * In the case where we couldn't compress, ensure that the caller
1647          * sends the data from the cache, since the guest might have
1648          * changed the RAM since we copied it.
1649          */
1650         *current_data = prev_cached_page;
1651     }
1652
1653     if (encoded_len == 0) {
1654         trace_save_xbzrle_page_skipping();
1655         return 0;
1656     } else if (encoded_len == -1) {
1657         trace_save_xbzrle_page_overflow();
1658         xbzrle_counters.overflow++;
1659         return -1;
1660     }
1661
1662     /* Send XBZRLE based compressed page */
1663     bytes_xbzrle = save_page_header(rs, rs->f, block,
1664                                     offset | RAM_SAVE_FLAG_XBZRLE);
1665     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1666     qemu_put_be16(rs->f, encoded_len);
1667     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1668     bytes_xbzrle += encoded_len + 1 + 2;
1669     xbzrle_counters.pages++;
1670     xbzrle_counters.bytes += bytes_xbzrle;
1671     ram_counters.transferred += bytes_xbzrle;
1672
1673     return 1;
1674 }
1675
1676 /**
1677  * migration_bitmap_find_dirty: find the next dirty page from start
1678  *
1679  * Returns the page offset within memory region of the start of a dirty page
1680  *
1681  * @rs: current RAM state
1682  * @rb: RAMBlock where to search for dirty pages
1683  * @start: page where we start the search
1684  */
1685 static inline
1686 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1687                                           unsigned long start)
1688 {
1689     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1690     unsigned long *bitmap = rb->bmap;
1691     unsigned long next;
1692
1693     if (ramblock_is_ignored(rb)) {
1694         return size;
1695     }
1696
1697     /*
1698      * When the free page optimization is enabled, we need to check the bitmap
1699      * to send the non-free pages rather than all the pages in the bulk stage.
1700      */
1701     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1702         next = start + 1;
1703     } else {
1704         next = find_next_bit(bitmap, size, start);
1705     }
1706
1707     return next;
1708 }
1709
1710 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1711                                                 RAMBlock *rb,
1712                                                 unsigned long page)
1713 {
1714     bool ret;
1715
1716     qemu_mutex_lock(&rs->bitmap_mutex);
1717
1718     /*
1719      * Clear dirty bitmap if needed.  This _must_ be called before we
1720      * send any of the page in the chunk because we need to make sure
1721      * we can capture further page content changes when we sync dirty
1722      * log the next time.  So as long as we are going to send any of
1723      * the page in the chunk we clear the remote dirty bitmap for all.
1724      * Clearing it earlier won't be a problem, but too late will.
1725      */
1726     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1727         uint8_t shift = rb->clear_bmap_shift;
1728         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1729         hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1730
1731         /*
1732          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1733          * can make things easier sometimes since then start address
1734          * of the small chunk will always be 64 pages aligned so the
1735          * bitmap will always be aligned to unsigned long.  We should
1736          * even be able to remove this restriction but I'm simply
1737          * keeping it.
1738          */
1739         assert(shift >= 6);
1740         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1741         memory_region_clear_dirty_bitmap(rb->mr, start, size);
1742     }
1743
1744     ret = test_and_clear_bit(page, rb->bmap);
1745
1746     if (ret) {
1747         rs->migration_dirty_pages--;
1748     }
1749     qemu_mutex_unlock(&rs->bitmap_mutex);
1750
1751     return ret;
1752 }
1753
1754 /* Called with RCU critical section */
1755 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1756 {
1757     rs->migration_dirty_pages +=
1758         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
1759                                               &rs->num_dirty_pages_period);
1760 }
1761
1762 /**
1763  * ram_pagesize_summary: calculate all the pagesizes of a VM
1764  *
1765  * Returns a summary bitmap of the page sizes of all RAMBlocks
1766  *
1767  * For VMs with just normal pages this is equivalent to the host page
1768  * size. If it's got some huge pages then it's the OR of all the
1769  * different page sizes.
1770  */
1771 uint64_t ram_pagesize_summary(void)
1772 {
1773     RAMBlock *block;
1774     uint64_t summary = 0;
1775
1776     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1777         summary |= block->page_size;
1778     }
1779
1780     return summary;
1781 }
1782
1783 uint64_t ram_get_total_transferred_pages(void)
1784 {
1785     return  ram_counters.normal + ram_counters.duplicate +
1786                 compression_counters.pages + xbzrle_counters.pages;
1787 }
1788
1789 static void migration_update_rates(RAMState *rs, int64_t end_time)
1790 {
1791     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1792     double compressed_size;
1793
1794     /* calculate period counters */
1795     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1796                 / (end_time - rs->time_last_bitmap_sync);
1797
1798     if (!page_count) {
1799         return;
1800     }
1801
1802     if (migrate_use_xbzrle()) {
1803         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1804             rs->xbzrle_cache_miss_prev) / page_count;
1805         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1806     }
1807
1808     if (migrate_use_compression()) {
1809         compression_counters.busy_rate = (double)(compression_counters.busy -
1810             rs->compress_thread_busy_prev) / page_count;
1811         rs->compress_thread_busy_prev = compression_counters.busy;
1812
1813         compressed_size = compression_counters.compressed_size -
1814                           rs->compressed_size_prev;
1815         if (compressed_size) {
1816             double uncompressed_size = (compression_counters.pages -
1817                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1818
1819             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1820             compression_counters.compression_rate =
1821                                         uncompressed_size / compressed_size;
1822
1823             rs->compress_pages_prev = compression_counters.pages;
1824             rs->compressed_size_prev = compression_counters.compressed_size;
1825         }
1826     }
1827 }
1828
1829 static void migration_bitmap_sync(RAMState *rs)
1830 {
1831     RAMBlock *block;
1832     int64_t end_time;
1833     uint64_t bytes_xfer_now;
1834
1835     ram_counters.dirty_sync_count++;
1836
1837     if (!rs->time_last_bitmap_sync) {
1838         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1839     }
1840
1841     trace_migration_bitmap_sync_start();
1842     memory_global_dirty_log_sync();
1843
1844     qemu_mutex_lock(&rs->bitmap_mutex);
1845     rcu_read_lock();
1846     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1847         ramblock_sync_dirty_bitmap(rs, block);
1848     }
1849     ram_counters.remaining = ram_bytes_remaining();
1850     rcu_read_unlock();
1851     qemu_mutex_unlock(&rs->bitmap_mutex);
1852
1853     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1854
1855     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1856
1857     /* more than 1 second = 1000 millisecons */
1858     if (end_time > rs->time_last_bitmap_sync + 1000) {
1859         bytes_xfer_now = ram_counters.transferred;
1860
1861         /* During block migration the auto-converge logic incorrectly detects
1862          * that ram migration makes no progress. Avoid this by disabling the
1863          * throttling logic during the bulk phase of block migration. */
1864         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1865             /* The following detection logic can be refined later. For now:
1866                Check to see if the dirtied bytes is 50% more than the approx.
1867                amount of bytes that just got transferred since the last time we
1868                were in this routine. If that happens twice, start or increase
1869                throttling */
1870
1871             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1872                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1873                 (++rs->dirty_rate_high_cnt >= 2)) {
1874                     trace_migration_throttle();
1875                     rs->dirty_rate_high_cnt = 0;
1876                     mig_throttle_guest_down();
1877             }
1878         }
1879
1880         migration_update_rates(rs, end_time);
1881
1882         rs->target_page_count_prev = rs->target_page_count;
1883
1884         /* reset period counters */
1885         rs->time_last_bitmap_sync = end_time;
1886         rs->num_dirty_pages_period = 0;
1887         rs->bytes_xfer_prev = bytes_xfer_now;
1888     }
1889     if (migrate_use_events()) {
1890         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1891     }
1892 }
1893
1894 static void migration_bitmap_sync_precopy(RAMState *rs)
1895 {
1896     Error *local_err = NULL;
1897
1898     /*
1899      * The current notifier usage is just an optimization to migration, so we
1900      * don't stop the normal migration process in the error case.
1901      */
1902     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1903         error_report_err(local_err);
1904     }
1905
1906     migration_bitmap_sync(rs);
1907
1908     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1909         error_report_err(local_err);
1910     }
1911 }
1912
1913 /**
1914  * save_zero_page_to_file: send the zero page to the file
1915  *
1916  * Returns the size of data written to the file, 0 means the page is not
1917  * a zero page
1918  *
1919  * @rs: current RAM state
1920  * @file: the file where the data is saved
1921  * @block: block that contains the page we want to send
1922  * @offset: offset inside the block for the page
1923  */
1924 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1925                                   RAMBlock *block, ram_addr_t offset)
1926 {
1927     uint8_t *p = block->host + offset;
1928     int len = 0;
1929
1930     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1931         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1932         qemu_put_byte(file, 0);
1933         len += 1;
1934     }
1935     return len;
1936 }
1937
1938 /**
1939  * save_zero_page: send the zero page to the stream
1940  *
1941  * Returns the number of pages written.
1942  *
1943  * @rs: current RAM state
1944  * @block: block that contains the page we want to send
1945  * @offset: offset inside the block for the page
1946  */
1947 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1948 {
1949     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1950
1951     if (len) {
1952         ram_counters.duplicate++;
1953         ram_counters.transferred += len;
1954         return 1;
1955     }
1956     return -1;
1957 }
1958
1959 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1960 {
1961     if (!migrate_release_ram() || !migration_in_postcopy()) {
1962         return;
1963     }
1964
1965     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1966 }
1967
1968 /*
1969  * @pages: the number of pages written by the control path,
1970  *        < 0 - error
1971  *        > 0 - number of pages written
1972  *
1973  * Return true if the pages has been saved, otherwise false is returned.
1974  */
1975 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1976                               int *pages)
1977 {
1978     uint64_t bytes_xmit = 0;
1979     int ret;
1980
1981     *pages = -1;
1982     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1983                                 &bytes_xmit);
1984     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1985         return false;
1986     }
1987
1988     if (bytes_xmit) {
1989         ram_counters.transferred += bytes_xmit;
1990         *pages = 1;
1991     }
1992
1993     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1994         return true;
1995     }
1996
1997     if (bytes_xmit > 0) {
1998         ram_counters.normal++;
1999     } else if (bytes_xmit == 0) {
2000         ram_counters.duplicate++;
2001     }
2002
2003     return true;
2004 }
2005
2006 /*
2007  * directly send the page to the stream
2008  *
2009  * Returns the number of pages written.
2010  *
2011  * @rs: current RAM state
2012  * @block: block that contains the page we want to send
2013  * @offset: offset inside the block for the page
2014  * @buf: the page to be sent
2015  * @async: send to page asyncly
2016  */
2017 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2018                             uint8_t *buf, bool async)
2019 {
2020     ram_counters.transferred += save_page_header(rs, rs->f, block,
2021                                                  offset | RAM_SAVE_FLAG_PAGE);
2022     if (async) {
2023         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2024                               migrate_release_ram() &
2025                               migration_in_postcopy());
2026     } else {
2027         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2028     }
2029     ram_counters.transferred += TARGET_PAGE_SIZE;
2030     ram_counters.normal++;
2031     return 1;
2032 }
2033
2034 /**
2035  * ram_save_page: send the given page to the stream
2036  *
2037  * Returns the number of pages written.
2038  *          < 0 - error
2039  *          >=0 - Number of pages written - this might legally be 0
2040  *                if xbzrle noticed the page was the same.
2041  *
2042  * @rs: current RAM state
2043  * @block: block that contains the page we want to send
2044  * @offset: offset inside the block for the page
2045  * @last_stage: if we are at the completion stage
2046  */
2047 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
2048 {
2049     int pages = -1;
2050     uint8_t *p;
2051     bool send_async = true;
2052     RAMBlock *block = pss->block;
2053     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2054     ram_addr_t current_addr = block->offset + offset;
2055
2056     p = block->host + offset;
2057     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2058
2059     XBZRLE_cache_lock();
2060     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2061         migrate_use_xbzrle()) {
2062         pages = save_xbzrle_page(rs, &p, current_addr, block,
2063                                  offset, last_stage);
2064         if (!last_stage) {
2065             /* Can't send this cached data async, since the cache page
2066              * might get updated before it gets to the wire
2067              */
2068             send_async = false;
2069         }
2070     }
2071
2072     /* XBZRLE overflow or normal page */
2073     if (pages == -1) {
2074         pages = save_normal_page(rs, block, offset, p, send_async);
2075     }
2076
2077     XBZRLE_cache_unlock();
2078
2079     return pages;
2080 }
2081
2082 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2083                                  ram_addr_t offset)
2084 {
2085     if (multifd_queue_page(rs, block, offset) < 0) {
2086         return -1;
2087     }
2088     ram_counters.normal++;
2089
2090     return 1;
2091 }
2092
2093 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2094                                  ram_addr_t offset, uint8_t *source_buf)
2095 {
2096     RAMState *rs = ram_state;
2097     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2098     bool zero_page = false;
2099     int ret;
2100
2101     if (save_zero_page_to_file(rs, f, block, offset)) {
2102         zero_page = true;
2103         goto exit;
2104     }
2105
2106     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2107
2108     /*
2109      * copy it to a internal buffer to avoid it being modified by VM
2110      * so that we can catch up the error during compression and
2111      * decompression
2112      */
2113     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2114     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2115     if (ret < 0) {
2116         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2117         error_report("compressed data failed!");
2118         return false;
2119     }
2120
2121 exit:
2122     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2123     return zero_page;
2124 }
2125
2126 static void
2127 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2128 {
2129     ram_counters.transferred += bytes_xmit;
2130
2131     if (param->zero_page) {
2132         ram_counters.duplicate++;
2133         return;
2134     }
2135
2136     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2137     compression_counters.compressed_size += bytes_xmit - 8;
2138     compression_counters.pages++;
2139 }
2140
2141 static bool save_page_use_compression(RAMState *rs);
2142
2143 static void flush_compressed_data(RAMState *rs)
2144 {
2145     int idx, len, thread_count;
2146
2147     if (!save_page_use_compression(rs)) {
2148         return;
2149     }
2150     thread_count = migrate_compress_threads();
2151
2152     qemu_mutex_lock(&comp_done_lock);
2153     for (idx = 0; idx < thread_count; idx++) {
2154         while (!comp_param[idx].done) {
2155             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2156         }
2157     }
2158     qemu_mutex_unlock(&comp_done_lock);
2159
2160     for (idx = 0; idx < thread_count; idx++) {
2161         qemu_mutex_lock(&comp_param[idx].mutex);
2162         if (!comp_param[idx].quit) {
2163             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2164             /*
2165              * it's safe to fetch zero_page without holding comp_done_lock
2166              * as there is no further request submitted to the thread,
2167              * i.e, the thread should be waiting for a request at this point.
2168              */
2169             update_compress_thread_counts(&comp_param[idx], len);
2170         }
2171         qemu_mutex_unlock(&comp_param[idx].mutex);
2172     }
2173 }
2174
2175 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2176                                        ram_addr_t offset)
2177 {
2178     param->block = block;
2179     param->offset = offset;
2180 }
2181
2182 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2183                                            ram_addr_t offset)
2184 {
2185     int idx, thread_count, bytes_xmit = -1, pages = -1;
2186     bool wait = migrate_compress_wait_thread();
2187
2188     thread_count = migrate_compress_threads();
2189     qemu_mutex_lock(&comp_done_lock);
2190 retry:
2191     for (idx = 0; idx < thread_count; idx++) {
2192         if (comp_param[idx].done) {
2193             comp_param[idx].done = false;
2194             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2195             qemu_mutex_lock(&comp_param[idx].mutex);
2196             set_compress_params(&comp_param[idx], block, offset);
2197             qemu_cond_signal(&comp_param[idx].cond);
2198             qemu_mutex_unlock(&comp_param[idx].mutex);
2199             pages = 1;
2200             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2201             break;
2202         }
2203     }
2204
2205     /*
2206      * wait for the free thread if the user specifies 'compress-wait-thread',
2207      * otherwise we will post the page out in the main thread as normal page.
2208      */
2209     if (pages < 0 && wait) {
2210         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2211         goto retry;
2212     }
2213     qemu_mutex_unlock(&comp_done_lock);
2214
2215     return pages;
2216 }
2217
2218 /**
2219  * find_dirty_block: find the next dirty page and update any state
2220  * associated with the search process.
2221  *
2222  * Returns true if a page is found
2223  *
2224  * @rs: current RAM state
2225  * @pss: data about the state of the current dirty page scan
2226  * @again: set to false if the search has scanned the whole of RAM
2227  */
2228 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2229 {
2230     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2231     if (pss->complete_round && pss->block == rs->last_seen_block &&
2232         pss->page >= rs->last_page) {
2233         /*
2234          * We've been once around the RAM and haven't found anything.
2235          * Give up.
2236          */
2237         *again = false;
2238         return false;
2239     }
2240     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2241         /* Didn't find anything in this RAM Block */
2242         pss->page = 0;
2243         pss->block = QLIST_NEXT_RCU(pss->block, next);
2244         if (!pss->block) {
2245             /*
2246              * If memory migration starts over, we will meet a dirtied page
2247              * which may still exists in compression threads's ring, so we
2248              * should flush the compressed data to make sure the new page
2249              * is not overwritten by the old one in the destination.
2250              *
2251              * Also If xbzrle is on, stop using the data compression at this
2252              * point. In theory, xbzrle can do better than compression.
2253              */
2254             flush_compressed_data(rs);
2255
2256             /* Hit the end of the list */
2257             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2258             /* Flag that we've looped */
2259             pss->complete_round = true;
2260             rs->ram_bulk_stage = false;
2261         }
2262         /* Didn't find anything this time, but try again on the new block */
2263         *again = true;
2264         return false;
2265     } else {
2266         /* Can go around again, but... */
2267         *again = true;
2268         /* We've found something so probably don't need to */
2269         return true;
2270     }
2271 }
2272
2273 /**
2274  * unqueue_page: gets a page of the queue
2275  *
2276  * Helper for 'get_queued_page' - gets a page off the queue
2277  *
2278  * Returns the block of the page (or NULL if none available)
2279  *
2280  * @rs: current RAM state
2281  * @offset: used to return the offset within the RAMBlock
2282  */
2283 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2284 {
2285     RAMBlock *block = NULL;
2286
2287     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2288         return NULL;
2289     }
2290
2291     qemu_mutex_lock(&rs->src_page_req_mutex);
2292     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2293         struct RAMSrcPageRequest *entry =
2294                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2295         block = entry->rb;
2296         *offset = entry->offset;
2297
2298         if (entry->len > TARGET_PAGE_SIZE) {
2299             entry->len -= TARGET_PAGE_SIZE;
2300             entry->offset += TARGET_PAGE_SIZE;
2301         } else {
2302             memory_region_unref(block->mr);
2303             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2304             g_free(entry);
2305             migration_consume_urgent_request();
2306         }
2307     }
2308     qemu_mutex_unlock(&rs->src_page_req_mutex);
2309
2310     return block;
2311 }
2312
2313 /**
2314  * get_queued_page: unqueue a page from the postcopy requests
2315  *
2316  * Skips pages that are already sent (!dirty)
2317  *
2318  * Returns true if a queued page is found
2319  *
2320  * @rs: current RAM state
2321  * @pss: data about the state of the current dirty page scan
2322  */
2323 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2324 {
2325     RAMBlock  *block;
2326     ram_addr_t offset;
2327     bool dirty;
2328
2329     do {
2330         block = unqueue_page(rs, &offset);
2331         /*
2332          * We're sending this page, and since it's postcopy nothing else
2333          * will dirty it, and we must make sure it doesn't get sent again
2334          * even if this queue request was received after the background
2335          * search already sent it.
2336          */
2337         if (block) {
2338             unsigned long page;
2339
2340             page = offset >> TARGET_PAGE_BITS;
2341             dirty = test_bit(page, block->bmap);
2342             if (!dirty) {
2343                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2344                        page, test_bit(page, block->unsentmap));
2345             } else {
2346                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2347             }
2348         }
2349
2350     } while (block && !dirty);
2351
2352     if (block) {
2353         /*
2354          * As soon as we start servicing pages out of order, then we have
2355          * to kill the bulk stage, since the bulk stage assumes
2356          * in (migration_bitmap_find_and_reset_dirty) that every page is
2357          * dirty, that's no longer true.
2358          */
2359         rs->ram_bulk_stage = false;
2360
2361         /*
2362          * We want the background search to continue from the queued page
2363          * since the guest is likely to want other pages near to the page
2364          * it just requested.
2365          */
2366         pss->block = block;
2367         pss->page = offset >> TARGET_PAGE_BITS;
2368
2369         /*
2370          * This unqueued page would break the "one round" check, even is
2371          * really rare.
2372          */
2373         pss->complete_round = false;
2374     }
2375
2376     return !!block;
2377 }
2378
2379 /**
2380  * migration_page_queue_free: drop any remaining pages in the ram
2381  * request queue
2382  *
2383  * It should be empty at the end anyway, but in error cases there may
2384  * be some left.  in case that there is any page left, we drop it.
2385  *
2386  */
2387 static void migration_page_queue_free(RAMState *rs)
2388 {
2389     struct RAMSrcPageRequest *mspr, *next_mspr;
2390     /* This queue generally should be empty - but in the case of a failed
2391      * migration might have some droppings in.
2392      */
2393     rcu_read_lock();
2394     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2395         memory_region_unref(mspr->rb->mr);
2396         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2397         g_free(mspr);
2398     }
2399     rcu_read_unlock();
2400 }
2401
2402 /**
2403  * ram_save_queue_pages: queue the page for transmission
2404  *
2405  * A request from postcopy destination for example.
2406  *
2407  * Returns zero on success or negative on error
2408  *
2409  * @rbname: Name of the RAMBLock of the request. NULL means the
2410  *          same that last one.
2411  * @start: starting address from the start of the RAMBlock
2412  * @len: length (in bytes) to send
2413  */
2414 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2415 {
2416     RAMBlock *ramblock;
2417     RAMState *rs = ram_state;
2418
2419     ram_counters.postcopy_requests++;
2420     rcu_read_lock();
2421     if (!rbname) {
2422         /* Reuse last RAMBlock */
2423         ramblock = rs->last_req_rb;
2424
2425         if (!ramblock) {
2426             /*
2427              * Shouldn't happen, we can't reuse the last RAMBlock if
2428              * it's the 1st request.
2429              */
2430             error_report("ram_save_queue_pages no previous block");
2431             goto err;
2432         }
2433     } else {
2434         ramblock = qemu_ram_block_by_name(rbname);
2435
2436         if (!ramblock) {
2437             /* We shouldn't be asked for a non-existent RAMBlock */
2438             error_report("ram_save_queue_pages no block '%s'", rbname);
2439             goto err;
2440         }
2441         rs->last_req_rb = ramblock;
2442     }
2443     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2444     if (start+len > ramblock->used_length) {
2445         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2446                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2447                      __func__, start, len, ramblock->used_length);
2448         goto err;
2449     }
2450
2451     struct RAMSrcPageRequest *new_entry =
2452         g_malloc0(sizeof(struct RAMSrcPageRequest));
2453     new_entry->rb = ramblock;
2454     new_entry->offset = start;
2455     new_entry->len = len;
2456
2457     memory_region_ref(ramblock->mr);
2458     qemu_mutex_lock(&rs->src_page_req_mutex);
2459     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2460     migration_make_urgent_request();
2461     qemu_mutex_unlock(&rs->src_page_req_mutex);
2462     rcu_read_unlock();
2463
2464     return 0;
2465
2466 err:
2467     rcu_read_unlock();
2468     return -1;
2469 }
2470
2471 static bool save_page_use_compression(RAMState *rs)
2472 {
2473     if (!migrate_use_compression()) {
2474         return false;
2475     }
2476
2477     /*
2478      * If xbzrle is on, stop using the data compression after first
2479      * round of migration even if compression is enabled. In theory,
2480      * xbzrle can do better than compression.
2481      */
2482     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2483         return true;
2484     }
2485
2486     return false;
2487 }
2488
2489 /*
2490  * try to compress the page before posting it out, return true if the page
2491  * has been properly handled by compression, otherwise needs other
2492  * paths to handle it
2493  */
2494 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2495 {
2496     if (!save_page_use_compression(rs)) {
2497         return false;
2498     }
2499
2500     /*
2501      * When starting the process of a new block, the first page of
2502      * the block should be sent out before other pages in the same
2503      * block, and all the pages in last block should have been sent
2504      * out, keeping this order is important, because the 'cont' flag
2505      * is used to avoid resending the block name.
2506      *
2507      * We post the fist page as normal page as compression will take
2508      * much CPU resource.
2509      */
2510     if (block != rs->last_sent_block) {
2511         flush_compressed_data(rs);
2512         return false;
2513     }
2514
2515     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2516         return true;
2517     }
2518
2519     compression_counters.busy++;
2520     return false;
2521 }
2522
2523 /**
2524  * ram_save_target_page: save one target page
2525  *
2526  * Returns the number of pages written
2527  *
2528  * @rs: current RAM state
2529  * @pss: data about the page we want to send
2530  * @last_stage: if we are at the completion stage
2531  */
2532 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2533                                 bool last_stage)
2534 {
2535     RAMBlock *block = pss->block;
2536     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2537     int res;
2538
2539     if (control_save_page(rs, block, offset, &res)) {
2540         return res;
2541     }
2542
2543     if (save_compress_page(rs, block, offset)) {
2544         return 1;
2545     }
2546
2547     res = save_zero_page(rs, block, offset);
2548     if (res > 0) {
2549         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2550          * page would be stale
2551          */
2552         if (!save_page_use_compression(rs)) {
2553             XBZRLE_cache_lock();
2554             xbzrle_cache_zero_page(rs, block->offset + offset);
2555             XBZRLE_cache_unlock();
2556         }
2557         ram_release_pages(block->idstr, offset, res);
2558         return res;
2559     }
2560
2561     /*
2562      * do not use multifd for compression as the first page in the new
2563      * block should be posted out before sending the compressed page
2564      */
2565     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2566         return ram_save_multifd_page(rs, block, offset);
2567     }
2568
2569     return ram_save_page(rs, pss, last_stage);
2570 }
2571
2572 /**
2573  * ram_save_host_page: save a whole host page
2574  *
2575  * Starting at *offset send pages up to the end of the current host
2576  * page. It's valid for the initial offset to point into the middle of
2577  * a host page in which case the remainder of the hostpage is sent.
2578  * Only dirty target pages are sent. Note that the host page size may
2579  * be a huge page for this block.
2580  * The saving stops at the boundary of the used_length of the block
2581  * if the RAMBlock isn't a multiple of the host page size.
2582  *
2583  * Returns the number of pages written or negative on error
2584  *
2585  * @rs: current RAM state
2586  * @ms: current migration state
2587  * @pss: data about the page we want to send
2588  * @last_stage: if we are at the completion stage
2589  */
2590 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2591                               bool last_stage)
2592 {
2593     int tmppages, pages = 0;
2594     size_t pagesize_bits =
2595         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2596
2597     if (ramblock_is_ignored(pss->block)) {
2598         error_report("block %s should not be migrated !", pss->block->idstr);
2599         return 0;
2600     }
2601
2602     do {
2603         /* Check the pages is dirty and if it is send it */
2604         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2605             pss->page++;
2606             continue;
2607         }
2608
2609         tmppages = ram_save_target_page(rs, pss, last_stage);
2610         if (tmppages < 0) {
2611             return tmppages;
2612         }
2613
2614         pages += tmppages;
2615         if (pss->block->unsentmap) {
2616             clear_bit(pss->page, pss->block->unsentmap);
2617         }
2618
2619         pss->page++;
2620     } while ((pss->page & (pagesize_bits - 1)) &&
2621              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2622
2623     /* The offset we leave with is the last one we looked at */
2624     pss->page--;
2625     return pages;
2626 }
2627
2628 /**
2629  * ram_find_and_save_block: finds a dirty page and sends it to f
2630  *
2631  * Called within an RCU critical section.
2632  *
2633  * Returns the number of pages written where zero means no dirty pages,
2634  * or negative on error
2635  *
2636  * @rs: current RAM state
2637  * @last_stage: if we are at the completion stage
2638  *
2639  * On systems where host-page-size > target-page-size it will send all the
2640  * pages in a host page that are dirty.
2641  */
2642
2643 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2644 {
2645     PageSearchStatus pss;
2646     int pages = 0;
2647     bool again, found;
2648
2649     /* No dirty page as there is zero RAM */
2650     if (!ram_bytes_total()) {
2651         return pages;
2652     }
2653
2654     pss.block = rs->last_seen_block;
2655     pss.page = rs->last_page;
2656     pss.complete_round = false;
2657
2658     if (!pss.block) {
2659         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2660     }
2661
2662     do {
2663         again = true;
2664         found = get_queued_page(rs, &pss);
2665
2666         if (!found) {
2667             /* priority queue empty, so just search for something dirty */
2668             found = find_dirty_block(rs, &pss, &again);
2669         }
2670
2671         if (found) {
2672             pages = ram_save_host_page(rs, &pss, last_stage);
2673         }
2674     } while (!pages && again);
2675
2676     rs->last_seen_block = pss.block;
2677     rs->last_page = pss.page;
2678
2679     return pages;
2680 }
2681
2682 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2683 {
2684     uint64_t pages = size / TARGET_PAGE_SIZE;
2685
2686     if (zero) {
2687         ram_counters.duplicate += pages;
2688     } else {
2689         ram_counters.normal += pages;
2690         ram_counters.transferred += size;
2691         qemu_update_position(f, size);
2692     }
2693 }
2694
2695 static uint64_t ram_bytes_total_common(bool count_ignored)
2696 {
2697     RAMBlock *block;
2698     uint64_t total = 0;
2699
2700     rcu_read_lock();
2701     if (count_ignored) {
2702         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2703             total += block->used_length;
2704         }
2705     } else {
2706         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2707             total += block->used_length;
2708         }
2709     }
2710     rcu_read_unlock();
2711     return total;
2712 }
2713
/* Total used length, in bytes, of the RAM blocks that are not ignored */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2718
2719 static void xbzrle_load_setup(void)
2720 {
2721     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2722 }
2723
2724 static void xbzrle_load_cleanup(void)
2725 {
2726     g_free(XBZRLE.decoded_buf);
2727     XBZRLE.decoded_buf = NULL;
2728 }
2729
2730 static void ram_state_cleanup(RAMState **rsp)
2731 {
2732     if (*rsp) {
2733         migration_page_queue_free(*rsp);
2734         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2735         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2736         g_free(*rsp);
2737         *rsp = NULL;
2738     }
2739 }
2740
2741 static void xbzrle_cleanup(void)
2742 {
2743     XBZRLE_cache_lock();
2744     if (XBZRLE.cache) {
2745         cache_fini(XBZRLE.cache);
2746         g_free(XBZRLE.encoded_buf);
2747         g_free(XBZRLE.current_buf);
2748         g_free(XBZRLE.zero_target_page);
2749         XBZRLE.cache = NULL;
2750         XBZRLE.encoded_buf = NULL;
2751         XBZRLE.current_buf = NULL;
2752         XBZRLE.zero_target_page = NULL;
2753     }
2754     XBZRLE_cache_unlock();
2755 }
2756
2757 static void ram_save_cleanup(void *opaque)
2758 {
2759     RAMState **rsp = opaque;
2760     RAMBlock *block;
2761
2762     /* caller have hold iothread lock or is in a bh, so there is
2763      * no writing race against the migration bitmap
2764      */
2765     memory_global_dirty_log_stop();
2766
2767     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2768         g_free(block->clear_bmap);
2769         block->clear_bmap = NULL;
2770         g_free(block->bmap);
2771         block->bmap = NULL;
2772         g_free(block->unsentmap);
2773         block->unsentmap = NULL;
2774     }
2775
2776     xbzrle_cleanup();
2777     compress_threads_save_cleanup();
2778     ram_state_cleanup(rsp);
2779 }
2780
2781 static void ram_state_reset(RAMState *rs)
2782 {
2783     rs->last_seen_block = NULL;
2784     rs->last_sent_block = NULL;
2785     rs->last_page = 0;
2786     rs->last_version = ram_list.version;
2787     rs->ram_bulk_stage = true;
2788     rs->fpo_enabled = false;
2789 }
2790
/*
 * Upper bound, in milliseconds, on the time ram_save_iterate() keeps
 * sending pages in one call before it stops.
 */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
2792
/*
 * Debug helper: print a bitmap to stderr, 128 pages per line, using '1'
 * for a set bit and '.' for a clear one.  Each line is prefixed with the
 * index of its first page.
 *
 * @todump: bitmap to dump; nothing is printed when it is NULL
 * @expected: the value the bitmap is expected to be mostly full of;
 *            lines consisting only of this value are not printed
 * @pages: number of bits of @todump to examine
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    enum { DUMP_LINE_LEN = 128 };
    char linebuf[DUMP_LINE_LEN + 1];
    unsigned long cur;

    /* There is no bitmap to fall back to, so a NULL request is a no-op */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += DUMP_LINE_LEN) {
        unsigned long linelen = pages - cur;
        unsigned long curb;
        bool found = false;

        /* The last line may be shorter than a full line */
        if (linelen > DUMP_LINE_LEN) {
            linelen = DUMP_LINE_LEN;
        }

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }

        /* Only print lines that contain at least one unexpected bit */
        if (found) {
            linebuf[linelen] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
2826
2827 /* **** functions for postcopy ***** */
2828
2829 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2830 {
2831     struct RAMBlock *block;
2832
2833     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2834         unsigned long *bitmap = block->bmap;
2835         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2836         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2837
2838         while (run_start < range) {
2839             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2840             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2841                               (run_end - run_start) << TARGET_PAGE_BITS);
2842             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2843         }
2844     }
2845 }
2846
2847 /**
2848  * postcopy_send_discard_bm_ram: discard a RAMBlock
2849  *
2850  * Returns zero on success
2851  *
2852  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2853  * Note: At this point the 'unsentmap' is the processed bitmap combined
2854  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2855  *
2856  * @ms: current migration state
2857  * @block: RAMBlock to discard
2858  */
2859 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2860 {
2861     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2862     unsigned long current;
2863     unsigned long *unsentmap = block->unsentmap;
2864
2865     for (current = 0; current < end; ) {
2866         unsigned long one = find_next_bit(unsentmap, end, current);
2867         unsigned long zero, discard_length;
2868
2869         if (one >= end) {
2870             break;
2871         }
2872
2873         zero = find_next_zero_bit(unsentmap, end, one + 1);
2874
2875         if (zero >= end) {
2876             discard_length = end - one;
2877         } else {
2878             discard_length = zero - one;
2879         }
2880         postcopy_discard_send_range(ms, one, discard_length);
2881         current = one + discard_length;
2882     }
2883
2884     return 0;
2885 }
2886
2887 /**
2888  * postcopy_each_ram_send_discard: discard all RAMBlocks
2889  *
2890  * Returns 0 for success or negative for error
2891  *
2892  * Utility for the outgoing postcopy code.
2893  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2894  *   passing it bitmap indexes and name.
2895  * (qemu_ram_foreach_block ends up passing unscaled lengths
2896  *  which would mean postcopy code would have to deal with target page)
2897  *
2898  * @ms: current migration state
2899  */
2900 static int postcopy_each_ram_send_discard(MigrationState *ms)
2901 {
2902     struct RAMBlock *block;
2903     int ret;
2904
2905     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2906         postcopy_discard_send_init(ms, block->idstr);
2907
2908         /*
2909          * Postcopy sends chunks of bitmap over the wire, but it
2910          * just needs indexes at this point, avoids it having
2911          * target page specific code.
2912          */
2913         ret = postcopy_send_discard_bm_ram(ms, block);
2914         postcopy_discard_send_finish(ms);
2915         if (ret) {
2916             return ret;
2917         }
2918     }
2919
2920     return 0;
2921 }
2922
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps: every host
 * page that is found to be only partially covered by a run is marked
 * fully unsent and fully dirty, and rs->migration_dirty_pages is bumped
 * for each page that was not dirty before.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages per host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {

        /*
         * If the run starts on a host page boundary its start is fine,
         * so move run_start to the end of the run and check that instead.
         * Otherwise run_start is left in the middle of a host page and
         * that host page is fixed up below.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            if (unsent_pass) {
                run_start = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            /* First target page of the host page that needs fixing up */
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            /* Continue the search from the next host page boundary */
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, fixup_start_addr, host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
3022
3023 /**
3024  * postcopy_chunk_hostpages: discard any partially sent host page
3025  *
3026  * Utility for the outgoing postcopy code.
3027  *
3028  * Discard any partially sent host-page size chunks, mark any partially
3029  * dirty host-page size chunks as all dirty.  In this case the host-page
3030  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
3031  *
3032  * Returns zero on success
3033  *
3034  * @ms: current migration state
3035  * @block: block we want to work with
3036  */
3037 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
3038 {
3039     postcopy_discard_send_init(ms, block->idstr);
3040
3041     /* First pass: Discard all partially sent host pages */
3042     postcopy_chunk_hostpages_pass(ms, true, block);
3043     /*
3044      * Second pass: Ensure that all partially dirty host pages are made
3045      * fully dirty.
3046      */
3047     postcopy_chunk_hostpages_pass(ms, false, block);
3048
3049     postcopy_discard_send_finish(ms);
3050     return 0;
3051 }
3052
3053 /**
3054  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3055  *
3056  * Returns zero on success
3057  *
3058  * Transmit the set of pages to be discarded after precopy to the target
3059  * these are pages that:
3060  *     a) Have been previously transmitted but are now dirty again
3061  *     b) Pages that have never been transmitted, this ensures that
3062  *        any pages on the destination that have been mapped by background
3063  *        tasks get discarded (transparent huge pages is the specific concern)
3064  * Hopefully this is pretty sparse
3065  *
3066  * @ms: current migration state
3067  */
3068 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3069 {
3070     RAMState *rs = ram_state;
3071     RAMBlock *block;
3072     int ret;
3073
3074     rcu_read_lock();
3075
3076     /* This should be our last sync, the src is now paused */
3077     migration_bitmap_sync(rs);
3078
3079     /* Easiest way to make sure we don't resume in the middle of a host-page */
3080     rs->last_seen_block = NULL;
3081     rs->last_sent_block = NULL;
3082     rs->last_page = 0;
3083
3084     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3085         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3086         unsigned long *bitmap = block->bmap;
3087         unsigned long *unsentmap = block->unsentmap;
3088
3089         if (!unsentmap) {
3090             /* We don't have a safe way to resize the sentmap, so
3091              * if the bitmap was resized it will be NULL at this
3092              * point.
3093              */
3094             error_report("migration ram resized during precopy phase");
3095             rcu_read_unlock();
3096             return -EINVAL;
3097         }
3098         /* Deal with TPS != HPS and huge pages */
3099         ret = postcopy_chunk_hostpages(ms, block);
3100         if (ret) {
3101             rcu_read_unlock();
3102             return ret;
3103         }
3104
3105         /*
3106          * Update the unsentmap to be unsentmap = unsentmap | dirty
3107          */
3108         bitmap_or(unsentmap, unsentmap, bitmap, pages);
3109 #ifdef DEBUG_POSTCOPY
3110         ram_debug_dump_bitmap(unsentmap, true, pages);
3111 #endif
3112     }
3113     trace_ram_postcopy_send_discard_bitmap();
3114
3115     ret = postcopy_each_ram_send_discard(ms);
3116     rcu_read_unlock();
3117
3118     return ret;
3119 }
3120
3121 /**
3122  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3123  *
3124  * Returns zero on success
3125  *
3126  * @rbname: name of the RAMBlock of the request. NULL means the
3127  *          same that last one.
3128  * @start: RAMBlock starting page
3129  * @length: RAMBlock size
3130  */
3131 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3132 {
3133     int ret = -1;
3134
3135     trace_ram_discard_range(rbname, start, length);
3136
3137     rcu_read_lock();
3138     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3139
3140     if (!rb) {
3141         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3142         goto err;
3143     }
3144
3145     /*
3146      * On source VM, we don't need to update the received bitmap since
3147      * we don't even have one.
3148      */
3149     if (rb->receivedmap) {
3150         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3151                      length >> qemu_target_page_bits());
3152     }
3153
3154     ret = ram_block_discard_range(rb, start, length);
3155
3156 err:
3157     rcu_read_unlock();
3158
3159     return ret;
3160 }
3161
3162 /*
3163  * For every allocation, we will try not to crash the VM if the
3164  * allocation failed.
3165  */
3166 static int xbzrle_init(void)
3167 {
3168     Error *local_err = NULL;
3169
3170     if (!migrate_use_xbzrle()) {
3171         return 0;
3172     }
3173
3174     XBZRLE_cache_lock();
3175
3176     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3177     if (!XBZRLE.zero_target_page) {
3178         error_report("%s: Error allocating zero page", __func__);
3179         goto err_out;
3180     }
3181
3182     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3183                               TARGET_PAGE_SIZE, &local_err);
3184     if (!XBZRLE.cache) {
3185         error_report_err(local_err);
3186         goto free_zero_page;
3187     }
3188
3189     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3190     if (!XBZRLE.encoded_buf) {
3191         error_report("%s: Error allocating encoded_buf", __func__);
3192         goto free_cache;
3193     }
3194
3195     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3196     if (!XBZRLE.current_buf) {
3197         error_report("%s: Error allocating current_buf", __func__);
3198         goto free_encoded_buf;
3199     }
3200
3201     /* We are all good */
3202     XBZRLE_cache_unlock();
3203     return 0;
3204
3205 free_encoded_buf:
3206     g_free(XBZRLE.encoded_buf);
3207     XBZRLE.encoded_buf = NULL;
3208 free_cache:
3209     cache_fini(XBZRLE.cache);
3210     XBZRLE.cache = NULL;
3211 free_zero_page:
3212     g_free(XBZRLE.zero_target_page);
3213     XBZRLE.zero_target_page = NULL;
3214 err_out:
3215     XBZRLE_cache_unlock();
3216     return -ENOMEM;
3217 }
3218
3219 static int ram_state_init(RAMState **rsp)
3220 {
3221     *rsp = g_try_new0(RAMState, 1);
3222
3223     if (!*rsp) {
3224         error_report("%s: Init ramstate fail", __func__);
3225         return -1;
3226     }
3227
3228     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3229     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3230     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3231
3232     /*
3233      * Count the total number of pages used by ram blocks not including any
3234      * gaps due to alignment or unplugs.
3235      * This must match with the initial values of dirty bitmap.
3236      */
3237     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3238     ram_state_reset(*rsp);
3239
3240     return 0;
3241 }
3242
3243 static void ram_list_init_bitmaps(void)
3244 {
3245     MigrationState *ms = migrate_get_current();
3246     RAMBlock *block;
3247     unsigned long pages;
3248     uint8_t shift;
3249
3250     /* Skip setting bitmap if there is no RAM */
3251     if (ram_bytes_total()) {
3252         shift = ms->clear_bitmap_shift;
3253         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3254             error_report("clear_bitmap_shift (%u) too big, using "
3255                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3256             shift = CLEAR_BITMAP_SHIFT_MAX;
3257         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3258             error_report("clear_bitmap_shift (%u) too small, using "
3259                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3260             shift = CLEAR_BITMAP_SHIFT_MIN;
3261         }
3262
3263         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3264             pages = block->max_length >> TARGET_PAGE_BITS;
3265             /*
3266              * The initial dirty bitmap for migration must be set with all
3267              * ones to make sure we'll migrate every guest RAM page to
3268              * destination.
3269              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3270              * new migration after a failed migration, ram_list.
3271              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3272              * guest memory.
3273              */
3274             block->bmap = bitmap_new(pages);
3275             bitmap_set(block->bmap, 0, pages);
3276             block->clear_bmap_shift = shift;
3277             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3278             if (migrate_postcopy_ram()) {
3279                 block->unsentmap = bitmap_new(pages);
3280                 bitmap_set(block->unsentmap, 0, pages);
3281             }
3282         }
3283     }
3284 }
3285
/*
 * Create the migration bitmaps, start global dirty logging and do a first
 * precopy bitmap sync for @rs.
 *
 * All three steps run with the iothread lock, the ramlist lock and the RCU
 * read lock held; the locks are released in the reverse order they were
 * taken.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync_precopy(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3301
3302 static int ram_init_all(RAMState **rsp)
3303 {
3304     if (ram_state_init(rsp)) {
3305         return -1;
3306     }
3307
3308     if (xbzrle_init()) {
3309         ram_state_cleanup(rsp);
3310         return -1;
3311     }
3312
3313     ram_init_bitmaps(*rsp);
3314
3315     return 0;
3316 }
3317
3318 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3319 {
3320     RAMBlock *block;
3321     uint64_t pages = 0;
3322
3323     /*
3324      * Postcopy is not using xbzrle/compression, so no need for that.
3325      * Also, since source are already halted, we don't need to care
3326      * about dirty page logging as well.
3327      */
3328
3329     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3330         pages += bitmap_count_one(block->bmap,
3331                                   block->used_length >> TARGET_PAGE_BITS);
3332     }
3333
3334     /* This may not be aligned with current bitmaps. Recalculate. */
3335     rs->migration_dirty_pages = pages;
3336
3337     rs->last_seen_block = NULL;
3338     rs->last_sent_block = NULL;
3339     rs->last_page = 0;
3340     rs->last_version = ram_list.version;
3341     /*
3342      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3343      * matter what we have sent.
3344      */
3345     rs->ram_bulk_stage = false;
3346
3347     /* Update RAMState cache of output QEMUFile */
3348     rs->f = out;
3349
3350     trace_ram_state_resume_prepare(pages);
3351 }
3352
3353 /*
3354  * This function clears bits of the free pages reported by the caller from the
3355  * migration dirty bitmap. @addr is the host address corresponding to the
3356  * start of the continuous guest free pages, and @len is the total bytes of
3357  * those pages.
3358  */
3359 void qemu_guest_free_page_hint(void *addr, size_t len)
3360 {
3361     RAMBlock *block;
3362     ram_addr_t offset;
3363     size_t used_len, start, npages;
3364     MigrationState *s = migrate_get_current();
3365
3366     /* This function is currently expected to be used during live migration */
3367     if (!migration_is_setup_or_active(s->state)) {
3368         return;
3369     }
3370
3371     for (; len > 0; len -= used_len, addr += used_len) {
3372         block = qemu_ram_block_from_host(addr, false, &offset);
3373         if (unlikely(!block || offset >= block->used_length)) {
3374             /*
3375              * The implementation might not support RAMBlock resize during
3376              * live migration, but it could happen in theory with future
3377              * updates. So we add a check here to capture that case.
3378              */
3379             error_report_once("%s unexpected error", __func__);
3380             return;
3381         }
3382
3383         if (len <= block->used_length - offset) {
3384             used_len = len;
3385         } else {
3386             used_len = block->used_length - offset;
3387         }
3388
3389         start = offset >> TARGET_PAGE_BITS;
3390         npages = used_len >> TARGET_PAGE_BITS;
3391
3392         qemu_mutex_lock(&ram_state->bitmap_mutex);
3393         ram_state->migration_dirty_pages -=
3394                       bitmap_count_one_with_offset(block->bmap, start, npages);
3395         bitmap_clear(block->bmap, start, npages);
3396         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3397     }
3398 }
3399
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * become numerous, it will be necessary to reduce the granularity of
 * these critical sections.
 */
3406
3407 /**
3408  * ram_save_setup: Setup RAM for migration
3409  *
3410  * Returns zero to indicate success and negative for error
3411  *
3412  * @f: QEMUFile where to send the data
3413  * @opaque: RAMState pointer
3414  */
3415 static int ram_save_setup(QEMUFile *f, void *opaque)
3416 {
3417     RAMState **rsp = opaque;
3418     RAMBlock *block;
3419
3420     if (compress_threads_save_setup()) {
3421         return -1;
3422     }
3423
3424     /* migration has already setup the bitmap, reuse it. */
3425     if (!migration_in_colo_state()) {
3426         if (ram_init_all(rsp) != 0) {
3427             compress_threads_save_cleanup();
3428             return -1;
3429         }
3430     }
3431     (*rsp)->f = f;
3432
3433     rcu_read_lock();
3434
3435     qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3436
3437     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3438         qemu_put_byte(f, strlen(block->idstr));
3439         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3440         qemu_put_be64(f, block->used_length);
3441         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3442             qemu_put_be64(f, block->page_size);
3443         }
3444         if (migrate_ignore_shared()) {
3445             qemu_put_be64(f, block->mr->addr);
3446         }
3447     }
3448
3449     rcu_read_unlock();
3450
3451     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3452     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3453
3454     multifd_send_sync_main(*rsp);
3455     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3456     qemu_fflush(f);
3457
3458     return 0;
3459 }
3460
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends pages until the rate limit is hit (unless page requests are still
 * queued), no page is left, an error occurs or more than MAX_WAIT ms have
 * elapsed, then terminates the section with RAM_SAVE_FLAG_EOS.
 *
 * Returns 1 when no more pages were found to send, 0 when the iteration
 * stopped for another reason, and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: pointer to the RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    rcu_read_lock();
    /* The RAM list changed since the last call: restart the search */
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    /* Keep going past the rate limit while page requests are queued */
    while ((ret = qemu_file_rate_limit(f)) == 0 ||
            !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        int pages;

        if (qemu_file_get_error(f)) {
            break;
        }

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }

        /* A negative page count is an error: record it on the file */
        if (pages < 0) {
            qemu_file_set_error(f, pages);
            break;
        }

        rs->target_page_count += pages;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check each some
           iterations
        */
        if ((i & 63) == 0) {
            /* Elapsed time since the loop started, in milliseconds */
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    multifd_send_sync_main(rs);
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);
    /* Account for the 8 byte EOS marker written above */
    ram_counters.transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
3554
3555 /**
3556  * ram_save_complete: function called to send the remaining amount of ram
3557  *
3558  * Returns zero to indicate success or negative on error
3559  *
3560  * Called with iothread lock
3561  *
3562  * @f: QEMUFile where to send the data
3563  * @opaque: RAMState pointer
3564  */
3565 static int ram_save_complete(QEMUFile *f, void *opaque)
3566 {
3567     RAMState **temp = opaque;
3568     RAMState *rs = *temp;
3569     int ret = 0;
3570
3571     rcu_read_lock();
3572
3573     if (!migration_in_postcopy()) {
3574         migration_bitmap_sync_precopy(rs);
3575     }
3576
3577     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3578
3579     /* try transferring iterative blocks of memory */
3580
3581     /* flush all remaining blocks regardless of rate limiting */
3582     while (true) {
3583         int pages;
3584
3585         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3586         /* no more blocks to sent */
3587         if (pages == 0) {
3588             break;
3589         }
3590         if (pages < 0) {
3591             ret = pages;
3592             break;
3593         }
3594     }
3595
3596     flush_compressed_data(rs);
3597     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3598
3599     rcu_read_unlock();
3600
3601     multifd_send_sync_main(rs);
3602     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3603     qemu_fflush(f);
3604
3605     return ret;
3606 }
3607
3608 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3609                              uint64_t *res_precopy_only,
3610                              uint64_t *res_compatible,
3611                              uint64_t *res_postcopy_only)
3612 {
3613     RAMState **temp = opaque;
3614     RAMState *rs = *temp;
3615     uint64_t remaining_size;
3616
3617     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3618
3619     if (!migration_in_postcopy() &&
3620         remaining_size < max_size) {
3621         qemu_mutex_lock_iothread();
3622         rcu_read_lock();
3623         migration_bitmap_sync_precopy(rs);
3624         rcu_read_unlock();
3625         qemu_mutex_unlock_iothread();
3626         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3627     }
3628
3629     if (migrate_postcopy_ram()) {
3630         /* We can do postcopy, and all the data is postcopiable */
3631         *res_compatible += remaining_size;
3632     } else {
3633         *res_precopy_only += remaining_size;
3634     }
3635 }
3636
3637 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3638 {
3639     unsigned int xh_len;
3640     int xh_flags;
3641     uint8_t *loaded_data;
3642
3643     /* extract RLE header */
3644     xh_flags = qemu_get_byte(f);
3645     xh_len = qemu_get_be16(f);
3646
3647     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3648         error_report("Failed to load XBZRLE page - wrong compression!");
3649         return -1;
3650     }
3651
3652     if (xh_len > TARGET_PAGE_SIZE) {
3653         error_report("Failed to load XBZRLE page - len overflow!");
3654         return -1;
3655     }
3656     loaded_data = XBZRLE.decoded_buf;
3657     /* load data and decode */
3658     /* it can change loaded_data to point to an internal buffer */
3659     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3660
3661     /* decode RLE */
3662     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3663                              TARGET_PAGE_SIZE) == -1) {
3664         error_report("Failed to load XBZRLE page - decode error!");
3665         return -1;
3666     }
3667
3668     return 0;
3669 }
3670
3671 /**
3672  * ram_block_from_stream: read a RAMBlock id from the migration stream
3673  *
3674  * Must be called from within a rcu critical section.
3675  *
3676  * Returns a pointer from within the RCU-protected ram_list.
3677  *
3678  * @f: QEMUFile where to read the data from
3679  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3680  */
3681 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3682 {
3683     static RAMBlock *block = NULL;
3684     char id[256];
3685     uint8_t len;
3686
3687     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3688         if (!block) {
3689             error_report("Ack, bad migration stream!");
3690             return NULL;
3691         }
3692         return block;
3693     }
3694
3695     len = qemu_get_byte(f);
3696     qemu_get_buffer(f, (uint8_t *)id, len);
3697     id[len] = 0;
3698
3699     block = qemu_ram_block_by_name(id);
3700     if (!block) {
3701         error_report("Can't find block %s", id);
3702         return NULL;
3703     }
3704
3705     if (ramblock_is_ignored(block)) {
3706         error_report("block %s should not be migrated !", id);
3707         return NULL;
3708     }
3709
3710     return block;
3711 }
3712
3713 static inline void *host_from_ram_block_offset(RAMBlock *block,
3714                                                ram_addr_t offset)
3715 {
3716     if (!offset_in_ramblock(block, offset)) {
3717         return NULL;
3718     }
3719
3720     return block->host + offset;
3721 }
3722
3723 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3724                                                  ram_addr_t offset)
3725 {
3726     if (!offset_in_ramblock(block, offset)) {
3727         return NULL;
3728     }
3729     if (!block->colo_cache) {
3730         error_report("%s: colo_cache is NULL in block :%s",
3731                      __func__, block->idstr);
3732         return NULL;
3733     }
3734
3735     /*
3736     * During colo checkpoint, we need bitmap of these migrated pages.
3737     * It help us to decide which pages in ram cache should be flushed
3738     * into VM's RAM later.
3739     */
3740     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3741         ram_state->migration_dirty_pages++;
3742     }
3743     return block->colo_cache + offset;
3744 }
3745
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with @ch.  The write is skipped when @ch is
 * zero and is_zero_range() reports that the range is already zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Already all zeroes: nothing to write */
        return;
    }

    memset(host, ch, size);
}
3762
3763 /* return the size after decompression, or negative value on error */
3764 static int
3765 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3766                      const uint8_t *source, size_t source_len)
3767 {
3768     int err;
3769
3770     err = inflateReset(stream);
3771     if (err != Z_OK) {
3772         return -1;
3773     }
3774
3775     stream->avail_in = source_len;
3776     stream->next_in = (uint8_t *)source;
3777     stream->avail_out = dest_len;
3778     stream->next_out = dest;
3779
3780     err = inflate(stream, Z_NO_FLUSH);
3781     if (err != Z_STREAM_END) {
3782         return -1;
3783     }
3784
3785     return stream->total_out;
3786 }
3787
3788 static void *do_data_decompress(void *opaque)
3789 {
3790     DecompressParam *param = opaque;
3791     unsigned long pagesize;
3792     uint8_t *des;
3793     int len, ret;
3794
3795     qemu_mutex_lock(&param->mutex);
3796     while (!param->quit) {
3797         if (param->des) {
3798             des = param->des;
3799             len = param->len;
3800             param->des = 0;
3801             qemu_mutex_unlock(&param->mutex);
3802
3803             pagesize = TARGET_PAGE_SIZE;
3804
3805             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3806                                        param->compbuf, len);
3807             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3808                 error_report("decompress data failed");
3809                 qemu_file_set_error(decomp_file, ret);
3810             }
3811
3812             qemu_mutex_lock(&decomp_done_lock);
3813             param->done = true;
3814             qemu_cond_signal(&decomp_done_cond);
3815             qemu_mutex_unlock(&decomp_done_lock);
3816
3817             qemu_mutex_lock(&param->mutex);
3818         } else {
3819             qemu_cond_wait(&param->cond, &param->mutex);
3820         }
3821     }
3822     qemu_mutex_unlock(&param->mutex);
3823
3824     return NULL;
3825 }
3826
3827 static int wait_for_decompress_done(void)
3828 {
3829     int idx, thread_count;
3830
3831     if (!migrate_use_compression()) {
3832         return 0;
3833     }
3834
3835     thread_count = migrate_decompress_threads();
3836     qemu_mutex_lock(&decomp_done_lock);
3837     for (idx = 0; idx < thread_count; idx++) {
3838         while (!decomp_param[idx].done) {
3839             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3840         }
3841     }
3842     qemu_mutex_unlock(&decomp_done_lock);
3843     return qemu_file_get_error(decomp_file);
3844 }
3845
3846 static void compress_threads_load_cleanup(void)
3847 {
3848     int i, thread_count;
3849
3850     if (!migrate_use_compression()) {
3851         return;
3852     }
3853     thread_count = migrate_decompress_threads();
3854     for (i = 0; i < thread_count; i++) {
3855         /*
3856          * we use it as a indicator which shows if the thread is
3857          * properly init'd or not
3858          */
3859         if (!decomp_param[i].compbuf) {
3860             break;
3861         }
3862
3863         qemu_mutex_lock(&decomp_param[i].mutex);
3864         decomp_param[i].quit = true;
3865         qemu_cond_signal(&decomp_param[i].cond);
3866         qemu_mutex_unlock(&decomp_param[i].mutex);
3867     }
3868     for (i = 0; i < thread_count; i++) {
3869         if (!decomp_param[i].compbuf) {
3870             break;
3871         }
3872
3873         qemu_thread_join(decompress_threads + i);
3874         qemu_mutex_destroy(&decomp_param[i].mutex);
3875         qemu_cond_destroy(&decomp_param[i].cond);
3876         inflateEnd(&decomp_param[i].stream);
3877         g_free(decomp_param[i].compbuf);
3878         decomp_param[i].compbuf = NULL;
3879     }
3880     g_free(decompress_threads);
3881     g_free(decomp_param);
3882     decompress_threads = NULL;
3883     decomp_param = NULL;
3884     decomp_file = NULL;
3885 }
3886
3887 static int compress_threads_load_setup(QEMUFile *f)
3888 {
3889     int i, thread_count;
3890
3891     if (!migrate_use_compression()) {
3892         return 0;
3893     }
3894
3895     thread_count = migrate_decompress_threads();
3896     decompress_threads = g_new0(QemuThread, thread_count);
3897     decomp_param = g_new0(DecompressParam, thread_count);
3898     qemu_mutex_init(&decomp_done_lock);
3899     qemu_cond_init(&decomp_done_cond);
3900     decomp_file = f;
3901     for (i = 0; i < thread_count; i++) {
3902         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3903             goto exit;
3904         }
3905
3906         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3907         qemu_mutex_init(&decomp_param[i].mutex);
3908         qemu_cond_init(&decomp_param[i].cond);
3909         decomp_param[i].done = true;
3910         decomp_param[i].quit = false;
3911         qemu_thread_create(decompress_threads + i, "decompress",
3912                            do_data_decompress, decomp_param + i,
3913                            QEMU_THREAD_JOINABLE);
3914     }
3915     return 0;
3916 exit:
3917     compress_threads_load_cleanup();
3918     return -1;
3919 }
3920
3921 static void decompress_data_with_multi_threads(QEMUFile *f,
3922                                                void *host, int len)
3923 {
3924     int idx, thread_count;
3925
3926     thread_count = migrate_decompress_threads();
3927     qemu_mutex_lock(&decomp_done_lock);
3928     while (true) {
3929         for (idx = 0; idx < thread_count; idx++) {
3930             if (decomp_param[idx].done) {
3931                 decomp_param[idx].done = false;
3932                 qemu_mutex_lock(&decomp_param[idx].mutex);
3933                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3934                 decomp_param[idx].des = host;
3935                 decomp_param[idx].len = len;
3936                 qemu_cond_signal(&decomp_param[idx].cond);
3937                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3938                 break;
3939             }
3940         }
3941         if (idx < thread_count) {
3942             break;
3943         } else {
3944             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3945         }
3946     }
3947     qemu_mutex_unlock(&decomp_done_lock);
3948 }
3949
3950 /*
3951  * colo cache: this is for secondary VM, we cache the whole
3952  * memory of the secondary VM, it is need to hold the global lock
3953  * to call this helper.
3954  */
3955 int colo_init_ram_cache(void)
3956 {
3957     RAMBlock *block;
3958
3959     rcu_read_lock();
3960     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3961         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3962                                                 NULL,
3963                                                 false);
3964         if (!block->colo_cache) {
3965             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3966                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3967                          block->used_length);
3968             goto out_locked;
3969         }
3970         memcpy(block->colo_cache, block->host, block->used_length);
3971     }
3972     rcu_read_unlock();
3973     /*
3974     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3975     * with to decide which page in cache should be flushed into SVM's RAM. Here
3976     * we use the same name 'ram_bitmap' as for migration.
3977     */
3978     if (ram_bytes_total()) {
3979         RAMBlock *block;
3980
3981         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3982             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3983
3984             block->bmap = bitmap_new(pages);
3985             bitmap_set(block->bmap, 0, pages);
3986         }
3987     }
3988     ram_state = g_new0(RAMState, 1);
3989     ram_state->migration_dirty_pages = 0;
3990     qemu_mutex_init(&ram_state->bitmap_mutex);
3991     memory_global_dirty_log_start();
3992
3993     return 0;
3994
3995 out_locked:
3996
3997     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3998         if (block->colo_cache) {
3999             qemu_anon_ram_free(block->colo_cache, block->used_length);
4000             block->colo_cache = NULL;
4001         }
4002     }
4003
4004     rcu_read_unlock();
4005     return -errno;
4006 }
4007
4008 /* It is need to hold the global lock to call this helper */
4009 void colo_release_ram_cache(void)
4010 {
4011     RAMBlock *block;
4012
4013     memory_global_dirty_log_stop();
4014     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4015         g_free(block->bmap);
4016         block->bmap = NULL;
4017     }
4018
4019     rcu_read_lock();
4020
4021     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4022         if (block->colo_cache) {
4023             qemu_anon_ram_free(block->colo_cache, block->used_length);
4024             block->colo_cache = NULL;
4025         }
4026     }
4027
4028     rcu_read_unlock();
4029     qemu_mutex_destroy(&ram_state->bitmap_mutex);
4030     g_free(ram_state);
4031     ram_state = NULL;
4032 }
4033
4034 /**
4035  * ram_load_setup: Setup RAM for migration incoming side
4036  *
4037  * Returns zero to indicate success and negative for error
4038  *
4039  * @f: QEMUFile where to receive the data
4040  * @opaque: RAMState pointer
4041  */
4042 static int ram_load_setup(QEMUFile *f, void *opaque)
4043 {
4044     if (compress_threads_load_setup(f)) {
4045         return -1;
4046     }
4047
4048     xbzrle_load_setup();
4049     ramblock_recv_map_init();
4050
4051     return 0;
4052 }
4053
4054 static int ram_load_cleanup(void *opaque)
4055 {
4056     RAMBlock *rb;
4057
4058     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4059         if (ramblock_is_pmem(rb)) {
4060             pmem_persist(rb->host, rb->used_length);
4061         }
4062     }
4063
4064     xbzrle_load_cleanup();
4065     compress_threads_load_cleanup();
4066
4067     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4068         g_free(rb->receivedmap);
4069         rb->receivedmap = NULL;
4070     }
4071
4072     return 0;
4073 }
4074
4075 /**
4076  * ram_postcopy_incoming_init: allocate postcopy data structures
4077  *
4078  * Returns 0 for success and negative if there was one error
4079  *
4080  * @mis: current migration incoming state
4081  *
4082  * Allocate data structures etc needed by incoming migration with
4083  * postcopy-ram. postcopy-ram's similarly names
4084  * postcopy_ram_incoming_init does the work.
4085  */
4086 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4087 {
4088     return postcopy_ram_incoming_init(mis);
4089 }
4090
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Reads records from @f until one carrying RAM_SAVE_FLAG_EOS is seen or an
 * error occurs.  Each record starts with a 64-bit word whose bits outside
 * TARGET_PAGE_MASK are the flags and whose remaining bits are the address.
 * Once the last target page of a host page has been read, the host page is
 * handed to postcopy_place_page() or postcopy_place_page_zero().
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to read the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    /* Host address of the previous record, for the ordering check below */
    void *last_host = NULL;
    /* Stays true while every target page of the current host page is zero */
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* Split the header word into flags and page-aligned address */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                /* First TP of a host page: restart the all-zero tracking */
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                  host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }


            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                                     (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        /* NULL for records that carry no page (e.g. EOS) */
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            /* A single fill byte follows; a non-zero one spoils all_zero */
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
    }

    return ret;
}
4241
4242 static bool postcopy_is_advised(void)
4243 {
4244     PostcopyState ps = postcopy_state_get();
4245     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4246 }
4247
4248 static bool postcopy_is_running(void)
4249 {
4250     PostcopyState ps = postcopy_state_get();
4251     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4252 }
4253
4254 /*
4255  * Flush content of RAM cache into SVM's memory.
4256  * Only flush the pages that be dirtied by PVM or SVM or both.
4257  */
4258 static void colo_flush_ram_cache(void)
4259 {
4260     RAMBlock *block = NULL;
4261     void *dst_host;
4262     void *src_host;
4263     unsigned long offset = 0;
4264
4265     memory_global_dirty_log_sync();
4266     rcu_read_lock();
4267     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4268         ramblock_sync_dirty_bitmap(ram_state, block);
4269     }
4270     rcu_read_unlock();
4271
4272     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4273     rcu_read_lock();
4274     block = QLIST_FIRST_RCU(&ram_list.blocks);
4275
4276     while (block) {
4277         offset = migration_bitmap_find_dirty(ram_state, block, offset);
4278
4279         if (offset << TARGET_PAGE_BITS >= block->used_length) {
4280             offset = 0;
4281             block = QLIST_NEXT_RCU(block, next);
4282         } else {
4283             migration_bitmap_clear_dirty(ram_state, block, offset);
4284             dst_host = block->host + (offset << TARGET_PAGE_BITS);
4285             src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4286             memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4287         }
4288     }
4289
4290     rcu_read_unlock();
4291     trace_colo_flush_ram_cache_end();
4292 }
4293
/**
 * ram_load_precopy: load pages in precopy case
 *
 * Reads records from @f until one carrying RAM_SAVE_FLAG_EOS is seen or an
 * error occurs.  Each record starts with a 64-bit word whose bits outside
 * TARGET_PAGE_MASK are the flags and whose remaining bits are the address
 * (or, for RAM_SAVE_FLAG_MEM_SIZE, the total RAM size).
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to read the data from
 */
static int ram_load_precopy(QEMUFile *f)
{
    int flags = 0, ret = 0, invalid_flags = 0, len = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    /* Compressed pages are only acceptable when compression is enabled */
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        /* Split the header word into flags and page-aligned address */
        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        /* Records that carry page data: resolve the destination first */
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            /*
             * After going into COLO, we should load the Page into colo_cache.
             */
            if (migration_incoming_in_colo_state()) {
                host = colo_cache_from_block_offset(block, addr);
            } else {
                host = host_from_ram_block_offset(block, addr);
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }

            /* The received bitmap is only maintained outside COLO state */
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            /*
             * One entry per block: length-prefixed id, then a 64-bit
             * length.  Each entry's length is subtracted from the total
             * until nothing is left.
             */
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    /* Adopt the length announced in the stream if it differs */
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        /* NOTE: this addr shadows the outer record address */
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 "!= %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            /* A single fill byte follows the header */
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            /* A full raw target page follows; read it straight into place */
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            /* 32-bit compressed length, bounded by the worker buffer size */
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            /* Anything else is only valid if it carries the HOOK flag */
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        /* Pick up any stream error raised while handling this record */
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    return ret;
}
4466
4467 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4468 {
4469     int ret = 0;
4470     static uint64_t seq_iter;
4471     /*
4472      * If system is running in postcopy mode, page inserts to host memory must
4473      * be atomic
4474      */
4475     bool postcopy_running = postcopy_is_running();
4476
4477     seq_iter++;
4478
4479     if (version_id != 4) {
4480         return -EINVAL;
4481     }
4482
4483     /*
4484      * This RCU critical section can be very long running.
4485      * When RCU reclaims in the code start to become numerous,
4486      * it will be necessary to reduce the granularity of this
4487      * critical section.
4488      */
4489     rcu_read_lock();
4490
4491     if (postcopy_running) {
4492         ret = ram_load_postcopy(f);
4493     } else {
4494         ret = ram_load_precopy(f);
4495     }
4496
4497     ret |= wait_for_decompress_done();
4498     rcu_read_unlock();
4499     trace_ram_load_complete(ret, seq_iter);
4500
4501     if (!ret  && migration_incoming_in_colo_state()) {
4502         colo_flush_ram_cache();
4503     }
4504     return ret;
4505 }
4506
4507 static bool ram_has_postcopy(void *opaque)
4508 {
4509     RAMBlock *rb;
4510     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4511         if (ramblock_is_pmem(rb)) {
4512             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4513                          "is not supported now!", rb->idstr, rb->host);
4514             return false;
4515         }
4516     }
4517
4518     return migrate_postcopy_ram();
4519 }
4520
4521 /* Sync all the dirty bitmap with destination VM.  */
4522 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4523 {
4524     RAMBlock *block;
4525     QEMUFile *file = s->to_dst_file;
4526     int ramblock_count = 0;
4527
4528     trace_ram_dirty_bitmap_sync_start();
4529
4530     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4531         qemu_savevm_send_recv_bitmap(file, block->idstr);
4532         trace_ram_dirty_bitmap_request(block->idstr);
4533         ramblock_count++;
4534     }
4535
4536     trace_ram_dirty_bitmap_sync_wait();
4537
4538     /* Wait until all the ramblocks' dirty bitmap synced */
4539     while (ramblock_count--) {
4540         qemu_sem_wait(&s->rp_state.rp_sem);
4541     }
4542
4543     trace_ram_dirty_bitmap_sync_complete();
4544
4545     return 0;
4546 }
4547
4548 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4549 {
4550     qemu_sem_post(&s->rp_state.rp_sem);
4551 }
4552
4553 /*
4554  * Read the received bitmap, revert it as the initial dirty bitmap.
4555  * This is only used when the postcopy migration is paused but wants
4556  * to resume from a middle point.
4557  */
4558 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4559 {
4560     int ret = -EINVAL;
4561     QEMUFile *file = s->rp_state.from_dst_file;
4562     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4563     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4564     uint64_t size, end_mark;
4565
4566     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4567
4568     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4569         error_report("%s: incorrect state %s", __func__,
4570                      MigrationStatus_str(s->state));
4571         return -EINVAL;
4572     }
4573
4574     /*
4575      * Note: see comments in ramblock_recv_bitmap_send() on why we
4576      * need the endianess convertion, and the paddings.
4577      */
4578     local_size = ROUND_UP(local_size, 8);
4579
4580     /* Add paddings */
4581     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4582
4583     size = qemu_get_be64(file);
4584
4585     /* The size of the bitmap should match with our ramblock */
4586     if (size != local_size) {
4587         error_report("%s: ramblock '%s' bitmap size mismatch "
4588                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4589                      block->idstr, size, local_size);
4590         ret = -EINVAL;
4591         goto out;
4592     }
4593
4594     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4595     end_mark = qemu_get_be64(file);
4596
4597     ret = qemu_file_get_error(file);
4598     if (ret || size != local_size) {
4599         error_report("%s: read bitmap failed for ramblock '%s': %d"
4600                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4601                      __func__, block->idstr, ret, local_size, size);
4602         ret = -EIO;
4603         goto out;
4604     }
4605
4606     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4607         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4608                      __func__, block->idstr, end_mark);
4609         ret = -EINVAL;
4610         goto out;
4611     }
4612
4613     /*
4614      * Endianess convertion. We are during postcopy (though paused).
4615      * The dirty bitmap won't change. We can directly modify it.
4616      */
4617     bitmap_from_le(block->bmap, le_bitmap, nbits);
4618
4619     /*
4620      * What we received is "received bitmap". Revert it as the initial
4621      * dirty bitmap for this ramblock.
4622      */
4623     bitmap_complement(block->bmap, block->bmap, nbits);
4624
4625     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4626
4627     /*
4628      * We succeeded to sync bitmap for current ramblock. If this is
4629      * the last one to sync, we need to notify the main send thread.
4630      */
4631     ram_dirty_bitmap_reload_notify(s);
4632
4633     ret = 0;
4634 out:
4635     g_free(le_bitmap);
4636     return ret;
4637 }
4638
4639 static int ram_resume_prepare(MigrationState *s, void *opaque)
4640 {
4641     RAMState *rs = *(RAMState **)opaque;
4642     int ret;
4643
4644     ret = ram_dirty_bitmap_sync_all(s, rs);
4645     if (ret) {
4646         return ret;
4647     }
4648
4649     ram_state_resume_prepare(rs, s->to_dst_file);
4650
4651     return 0;
4652 }
4653
4654 static SaveVMHandlers savevm_ram_handlers = {
4655     .save_setup = ram_save_setup,
4656     .save_live_iterate = ram_save_iterate,
4657     .save_live_complete_postcopy = ram_save_complete,
4658     .save_live_complete_precopy = ram_save_complete,
4659     .has_postcopy = ram_has_postcopy,
4660     .save_live_pending = ram_save_pending,
4661     .load_state = ram_load,
4662     .save_cleanup = ram_save_cleanup,
4663     .load_setup = ram_load_setup,
4664     .load_cleanup = ram_load_cleanup,
4665     .resume_prepare = ram_resume_prepare,
4666 };
4667
4668 void ram_mig_init(void)
4669 {
4670     qemu_mutex_init(&XBZRLE.lock);
4671     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4672 }