migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <[email protected]>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "socket.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/sysemu.h"
  56 #include "qemu/uuid.h"
  57 #include "savevm.h"
  58
  59 /***********************************************************/
  60 /* ram save/restore */
  61
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  67
/*
 * Flag values used by the RAM save/restore code.  Every value is a
 * distinct single bit, so several flags can be OR-ed into one word.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h, so the flags continue at 0x100 */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  77
  78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  79 {
  80     return buffer_is_zero(p, size);
  81 }
  82
/* XBZRLE cache statistics; not static, so other source files can use it */
XBZRLECacheStats xbzrle_counters;
  84
/*
 * XBZRLE working state: the page cache plus the buffers used while
 * encoding and decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    /* taken by XBZRLE_cache_lock(), released by XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 100
 101 static void XBZRLE_cache_lock(void)
 102 {
 103     if (migrate_use_xbzrle())
 104         qemu_mutex_lock(&XBZRLE.lock);
 105 }
 106
 107 static void XBZRLE_cache_unlock(void)
 108 {
 109     if (migrate_use_xbzrle())
 110         qemu_mutex_unlock(&XBZRLE.lock);
 111 }
 112
 113 /**
 114  * xbzrle_cache_resize: resize the xbzrle cache
 115  *
 116  * This function is called from qmp_migrate_set_cache_size in main
 117  * thread, possibly while a migration is in progress.  A running
 118  * migration may be using the cache and might finish during this call,
 119  * hence changes to the cache are protected by XBZRLE.lock().
 120  *
 121  * Returns 0 for success or -1 for error
 122  *
 123  * @new_size: new cache size
 124  * @errp: set *errp if the check failed, with reason
 125  */
 126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 127 {
 128     PageCache *new_cache;
 129     int64_t ret = 0;
 130
 131     /* Check for truncation */
 132     if (new_size != (size_t)new_size) {
 133         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 134                    "exceeding address space");
 135         return -1;
 136     }
 137
 138     if (new_size == migrate_xbzrle_cache_size()) {
 139         /* nothing to do */
 140         return 0;
 141     }
 142
 143     XBZRLE_cache_lock();
 144
 145     if (XBZRLE.cache != NULL) {
 146         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 147         if (!new_cache) {
 148             ret = -1;
 149             goto out;
 150         }
 151
 152         cache_fini(XBZRLE.cache);
 153         XBZRLE.cache = new_cache;
 154     }
 155 out:
 156     XBZRLE_cache_unlock();
 157     return ret;
 158 }
 159
/*
 * Iterate over the RAM blocks for which qemu_ram_is_migratable() is
 * true; the statement following the macro becomes the "else" branch.
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/* From here on the unfiltered RAMBLOCK_FOREACH is no longer available */
#undef RAMBLOCK_FOREACH
 166
 167 static void ramblock_recv_map_init(void)
 168 {
 169     RAMBlock *rb;
 170
 171     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 172         assert(!rb->receivedmap);
 173         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 174     }
 175 }
 176
 177 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 178 {
 179     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 180                     rb->receivedmap);
 181 }
 182
 183 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 184 {
 185     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 186 }
 187
 188 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 189 {
 190     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 191 }
 192
 193 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 194                                     size_t nr)
 195 {
 196     bitmap_set_atomic(rb->receivedmap,
 197                       ramblock_recv_bitmap_offset(host_addr, rb),
 198                       nr);
 199 }
 200
 201 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 202
 203 /*
 204  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 205  *
 206  * Returns >0 if success with sent bytes, or <0 if error.
 207  */
 208 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 209                                   const char *block_name)
 210 {
 211     RAMBlock *block = qemu_ram_block_by_name(block_name);
 212     unsigned long *le_bitmap, nbits;
 213     uint64_t size;
 214
 215     if (!block) {
 216         error_report("%s: invalid block name: %s", __func__, block_name);
 217         return -1;
 218     }
 219
 220     nbits = block->used_length >> TARGET_PAGE_BITS;
 221
 222     /*
 223      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 224      * machines we may need 4 more bytes for padding (see below
 225      * comment). So extend it a bit before hand.
 226      */
 227     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 228
 229     /*
 230      * Always use little endian when sending the bitmap. This is
 231      * required that when source and destination VMs are not using the
 232      * same endianess. (Note: big endian won't work.)
 233      */
 234     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 235
 236     /* Size of the bitmap, in bytes */
 237     size = nbits / 8;
 238
 239     /*
 240      * size is always aligned to 8 bytes for 64bit machines, but it
 241      * may not be true for 32bit machines. We need this padding to
 242      * make sure the migration can survive even between 32bit and
 243      * 64bit machines.
 244      */
 245     size = ROUND_UP(size, 8);
 246
 247     qemu_put_be64(file, size);
 248     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 249     /*
 250      * Mark as an end, in case the middle part is screwed up due to
 251      * some "misterious" reason.
 252      */
 253     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 254     qemu_fflush(file);
 255
 256     g_free(le_bitmap);
 257
 258     if (qemu_file_get_error(file)) {
 259         return qemu_file_get_error(file);
 260     }
 261
 262     return size + sizeof(size);
 263 }
 264
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAM block the request refers to */
    RAMBlock *rb;
    /* presumably start and length of the range inside rb - TODO confirm */
    hwaddr    offset;
    hwaddr    len;

    /* link in the RAMState src_page_requests queue */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 276
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap; see ram_bytes_remaining() */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 317
/* RAM migration state; may be NULL (ram_bytes_remaining() checks for it) */
static RAMState *ram_state;
 319
 320 uint64_t ram_bytes_remaining(void)
 321 {
 322     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 323                        0;
 324 }
 325
/* RAM migration statistics; not static, so other source files can use it */
MigrationStats ram_counters;
 327
/* Cursor used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 338
/* Per-thread state of one compression worker, see do_data_compress() */
struct CompressParam {
    /* set by the worker, under comp_done_lock, after each page */
    bool done;
    /* makes the worker leave its loop */
    bool quit;
    /* passed to do_compress_ram_page(); opened with empty_ops */
    QEMUFile *file;
    /* mutex and cond the worker uses while waiting for block or quit */
    QemuMutex mutex;
    QemuCond cond;
    /* page to compress next; the worker resets block to NULL on pickup */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 353
/*
 * Per-thread state for decompression (see decomp_param).  The code that
 * uses it is outside this part of the file; the field layout mirrors
 * CompressParam (done/quit flags, mutex/cond pair, zlib stream).
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data - TODO confirm */
    void *des;
    /* presumably the compressed input and its length - TODO confirm */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 365
/* One entry per compression thread, see compress_threads_save_setup() */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression counterparts; their users are outside this part of the file */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: called by do_data_compress() below */
static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                ram_addr_t offset, uint8_t *source_buf);
 385
/*
 * Body of one compression worker thread; @opaque is its CompressParam.
 *
 * The thread waits on param->cond until param->block is set, hands that
 * page to do_compress_ram_page(), then sets param->done and signals
 * comp_done_cond.  It returns NULL once param->quit is set.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* take the request and clear it so it is handled only once */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            /* param->mutex is not held while the page is compressed */
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, &param->stream, block, offset,
                                 param->originbuf);

            /* done is published under comp_done_lock, not param->mutex */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* re-take param->mutex before re-checking quit and block */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 417
 418 static inline void terminate_compression_threads(void)
 419 {
 420     int idx, thread_count;
 421
 422     thread_count = migrate_compress_threads();
 423
 424     for (idx = 0; idx < thread_count; idx++) {
 425         qemu_mutex_lock(&comp_param[idx].mutex);
 426         comp_param[idx].quit = true;
 427         qemu_cond_signal(&comp_param[idx].cond);
 428         qemu_mutex_unlock(&comp_param[idx].mutex);
 429     }
 430 }
 431
 432 static void compress_threads_save_cleanup(void)
 433 {
 434     int i, thread_count;
 435
 436     if (!migrate_use_compression()) {
 437         return;
 438     }
 439     terminate_compression_threads();
 440     thread_count = migrate_compress_threads();
 441     for (i = 0; i < thread_count; i++) {
 442         /*
 443          * we use it as a indicator which shows if the thread is
 444          * properly init'd or not
 445          */
 446         if (!comp_param[i].file) {
 447             break;
 448         }
 449         qemu_thread_join(compress_threads + i);
 450         qemu_mutex_destroy(&comp_param[i].mutex);
 451         qemu_cond_destroy(&comp_param[i].cond);
 452         deflateEnd(&comp_param[i].stream);
 453         g_free(comp_param[i].originbuf);
 454         qemu_fclose(comp_param[i].file);
 455         comp_param[i].file = NULL;
 456     }
 457     qemu_mutex_destroy(&comp_done_lock);
 458     qemu_cond_destroy(&comp_done_cond);
 459     g_free(compress_threads);
 460     g_free(comp_param);
 461     compress_threads = NULL;
 462     comp_param = NULL;
 463 }
 464
 465 static int compress_threads_save_setup(void)
 466 {
 467     int i, thread_count;
 468
 469     if (!migrate_use_compression()) {
 470         return 0;
 471     }
 472     thread_count = migrate_compress_threads();
 473     compress_threads = g_new0(QemuThread, thread_count);
 474     comp_param = g_new0(CompressParam, thread_count);
 475     qemu_cond_init(&comp_done_cond);
 476     qemu_mutex_init(&comp_done_lock);
 477     for (i = 0; i < thread_count; i++) {
 478         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 479         if (!comp_param[i].originbuf) {
 480             goto exit;
 481         }
 482
 483         if (deflateInit(&comp_param[i].stream,
 484                         migrate_compress_level()) != Z_OK) {
 485             g_free(comp_param[i].originbuf);
 486             goto exit;
 487         }
 488
 489         /* comp_param[i].file is just used as a dummy buffer to save data,
 490          * set its ops to empty.
 491          */
 492         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 493         comp_param[i].done = true;
 494         comp_param[i].quit = false;
 495         qemu_mutex_init(&comp_param[i].mutex);
 496         qemu_cond_init(&comp_param[i].cond);
 497         qemu_thread_create(compress_threads + i, "compress",
 498                            do_data_compress, comp_param + i,
 499                            QEMU_THREAD_JOINABLE);
 500     }
 501     return 0;
 502
 503 exit:
 504     compress_threads_save_cleanup();
 505     return -1;
 506 }
 507
/* Multiple fd's */

/* Magic and version written into every multifd init message and packet */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Packet flag set on each channel by multifd_send_sync_main() */
#define MULTIFD_FLAG_SYNC (1 << 0)
 514
/*
 * First message on a multifd channel, see multifd_send_initial_packet().
 * magic and version are converted to big endian before being written.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    uint8_t id;             /* channel number of the sender */
} __attribute__((packed)) MultiFDInit_t;
 521
/*
 * Packet header describing one batch of pages.
 * multifd_send_fill_packet() stores every integer field in big endian.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    /* the sender stores migrate_multifd_page_count() here */
    uint32_t size;
    /* number of entries of offset[] that are in use */
    uint32_t used;
    uint64_t packet_num;
    /* idstr of the RAM block the pages belong to */
    char ramblock[256];
    /* offset of each page */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 532
/* A batch of pages; created by multifd_pages_init() */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages (entries in offset[] and iov[]) */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAM block the pages belong to; may be NULL */
    RAMBlock *block;
} MultiFDPages_t;
 546
/* State of one multifd send channel and its thread */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* number of jobs the thread still has to process */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 585
/*
 * State of one multifd receive channel and its thread; same layout as
 * MultiFDSendParams except that pending_job is a bool here.
 */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    bool pending_job;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 624
 625 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 626 {
 627     MultiFDInit_t msg;
 628     int ret;
 629
 630     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 631     msg.version = cpu_to_be32(MULTIFD_VERSION);
 632     msg.id = p->id;
 633     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 634
 635     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 636     if (ret != 0) {
 637         return -1;
 638     }
 639     return 0;
 640 }
 641
 642 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 643 {
 644     MultiFDInit_t msg;
 645     int ret;
 646
 647     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 648     if (ret != 0) {
 649         return -1;
 650     }
 651
 652     be32_to_cpus(&msg.magic);
 653     be32_to_cpus(&msg.version);
 654
 655     if (msg.magic != MULTIFD_MAGIC) {
 656         error_setg(errp, "multifd: received packet magic %x "
 657                    "expected %x", msg.magic, MULTIFD_MAGIC);
 658         return -1;
 659     }
 660
 661     if (msg.version != MULTIFD_VERSION) {
 662         error_setg(errp, "multifd: received packet version %d "
 663                    "expected %d", msg.version, MULTIFD_VERSION);
 664         return -1;
 665     }
 666
 667     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 668         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 669         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 670
 671         error_setg(errp, "multifd: received uuid '%s' and expected "
 672                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 673         g_free(uuid);
 674         g_free(msg_uuid);
 675         return -1;
 676     }
 677
 678     if (msg.id > migrate_multifd_channels()) {
 679         error_setg(errp, "multifd: received channel version %d "
 680                    "expected %d", msg.version, MULTIFD_VERSION);
 681         return -1;
 682     }
 683
 684     return msg.id;
 685 }
 686
 687 static MultiFDPages_t *multifd_pages_init(size_t size)
 688 {
 689     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 690
 691     pages->allocated = size;
 692     pages->iov = g_new0(struct iovec, size);
 693     pages->offset = g_new0(ram_addr_t, size);
 694
 695     return pages;
 696 }
 697
 698 static void multifd_pages_clear(MultiFDPages_t *pages)
 699 {
 700     pages->used = 0;
 701     pages->allocated = 0;
 702     pages->packet_num = 0;
 703     pages->block = NULL;
 704     g_free(pages->iov);
 705     pages->iov = NULL;
 706     g_free(pages->offset);
 707     pages->offset = NULL;
 708     g_free(pages);
 709 }
 710
 711 static void multifd_send_fill_packet(MultiFDSendParams *p)
 712 {
 713     MultiFDPacket_t *packet = p->packet;
 714     int i;
 715
 716     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 717     packet->version = cpu_to_be32(MULTIFD_VERSION);
 718     packet->flags = cpu_to_be32(p->flags);
 719     packet->size = cpu_to_be32(migrate_multifd_page_count());
 720     packet->used = cpu_to_be32(p->pages->used);
 721     packet->packet_num = cpu_to_be64(p->packet_num);
 722
 723     if (p->pages->block) {
 724         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 725     }
 726
 727     for (i = 0; i < p->pages->used; i++) {
 728         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 729     }
 730 }
 731
/*
 * Validate the packet header of @p and turn it into the page list of
 * p->pages (one iovec of TARGET_PAGE_SIZE per received offset).
 *
 * NOTE: the function currently returns 0 unconditionally right at the
 * start (see the ToDo), so none of the validation below is executed.
 *
 * Once enabled: returns 0 on success, or -1 with @errp set when the
 * magic, version, size, used count, RAM block name or a page offset is
 * not acceptable.
 */
static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
{
    MultiFDPacket_t *packet = p->packet;
    RAMBlock *block;
    int i;

    /* ToDo: We can't use it until we haven't received a message */
    return 0;

    be32_to_cpus(&packet->magic);
    if (packet->magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet "
                   "magic %x and expected magic %x",
                   packet->magic, MULTIFD_MAGIC);
        return -1;
    }

    be32_to_cpus(&packet->version);
    if (packet->version != MULTIFD_VERSION) {
        error_setg(errp, "multifd: received packet "
                   "version %d and expected version %d",
                   packet->version, MULTIFD_VERSION);
        return -1;
    }

    p->flags = be32_to_cpu(packet->flags);

    /* the announced size may not exceed the local page count limit */
    be32_to_cpus(&packet->size);
    if (packet->size > migrate_multifd_page_count()) {
        error_setg(errp, "multifd: received packet "
                   "with size %d and expected maximum size %d",
                   packet->size, migrate_multifd_page_count()) ;
        return -1;
    }

    /* the number of used pages may not exceed the announced size */
    p->pages->used = be32_to_cpu(packet->used);
    if (p->pages->used > packet->size) {
        error_setg(errp, "multifd: received packet "
                   "with size %d and expected maximum size %d",
                   p->pages->used, packet->size) ;
        return -1;
    }

    p->packet_num = be64_to_cpu(packet->packet_num);

    /* block is only looked up (and only needed) when pages were sent */
    if (p->pages->used) {
        /* make sure that ramblock is 0 terminated */
        packet->ramblock[255] = 0;
        block = qemu_ram_block_by_name(packet->ramblock);
        if (!block) {
            error_setg(errp, "multifd: unknown ram block %s",
                       packet->ramblock);
            return -1;
        }
    }

    for (i = 0; i < p->pages->used; i++) {
        ram_addr_t offset = be64_to_cpu(packet->offset[i]);

        /*
         * NOTE(review): the check is against used_length while the
         * message prints max_length - confirm which one is intended.
         */
        if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
            error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
                       " (max " RAM_ADDR_FMT ")",
                       offset, block->max_length);
            return -1;
        }
        p->pages->iov[i].iov_base = block->host + offset;
        p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
    }

    return 0;
}
 803
/*
 * Global state of the multifd send side; allocated in
 * multifd_save_setup() and freed in multifd_save_cleanup().
 */
struct {
    /* one entry per multifd channel */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_send_state;
 815
 816 static void multifd_send_terminate_threads(Error *err)
 817 {
 818     int i;
 819
 820     if (err) {
 821         MigrationState *s = migrate_get_current();
 822         migrate_set_error(s, err);
 823         if (s->state == MIGRATION_STATUS_SETUP ||
 824             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 825             s->state == MIGRATION_STATUS_DEVICE ||
 826             s->state == MIGRATION_STATUS_ACTIVE) {
 827             migrate_set_state(&s->state, s->state,
 828                               MIGRATION_STATUS_FAILED);
 829         }
 830     }
 831
 832     for (i = 0; i < migrate_multifd_channels(); i++) {
 833         MultiFDSendParams *p = &multifd_send_state->params[i];
 834
 835         qemu_mutex_lock(&p->mutex);
 836         p->quit = true;
 837         qemu_sem_post(&p->sem);
 838         qemu_mutex_unlock(&p->mutex);
 839     }
 840 }
 841
 842 int multifd_save_cleanup(Error **errp)
 843 {
 844     int i;
 845     int ret = 0;
 846
 847     if (!migrate_use_multifd()) {
 848         return 0;
 849     }
 850     multifd_send_terminate_threads(NULL);
 851     for (i = 0; i < migrate_multifd_channels(); i++) {
 852         MultiFDSendParams *p = &multifd_send_state->params[i];
 853
 854         if (p->running) {
 855             qemu_thread_join(&p->thread);
 856         }
 857         socket_send_channel_destroy(p->c);
 858         p->c = NULL;
 859         qemu_mutex_destroy(&p->mutex);
 860         qemu_sem_destroy(&p->sem);
 861         qemu_sem_destroy(&p->sem_sync);
 862         g_free(p->name);
 863         p->name = NULL;
 864         multifd_pages_clear(p->pages);
 865         p->pages = NULL;
 866         p->packet_len = 0;
 867         g_free(p->packet);
 868         p->packet = NULL;
 869     }
 870     qemu_sem_destroy(&multifd_send_state->sem_sync);
 871     g_free(multifd_send_state->params);
 872     multifd_send_state->params = NULL;
 873     multifd_pages_clear(multifd_send_state->pages);
 874     multifd_send_state->pages = NULL;
 875     g_free(multifd_send_state);
 876     multifd_send_state = NULL;
 877     return ret;
 878 }
 879
 880 static void multifd_send_sync_main(void)
 881 {
 882     int i;
 883
 884     if (!migrate_use_multifd()) {
 885         return;
 886     }
 887     for (i = 0; i < migrate_multifd_channels(); i++) {
 888         MultiFDSendParams *p = &multifd_send_state->params[i];
 889
 890         trace_multifd_send_sync_main_signal(p->id);
 891
 892         qemu_mutex_lock(&p->mutex);
 893         p->flags |= MULTIFD_FLAG_SYNC;
 894         p->pending_job++;
 895         qemu_mutex_unlock(&p->mutex);
 896         qemu_sem_post(&p->sem);
 897     }
 898     for (i = 0; i < migrate_multifd_channels(); i++) {
 899         MultiFDSendParams *p = &multifd_send_state->params[i];
 900
 901         trace_multifd_send_sync_main_wait(p->id);
 902         qemu_sem_wait(&multifd_send_state->sem_sync);
 903     }
 904     trace_multifd_send_sync_main(multifd_send_state->packet_num);
 905 }
 906
/*
 * Body of one multifd send thread; @opaque is its MultiFDSendParams.
 *
 * Sends the initial packet, then loops: wait on p->sem, and under
 * p->mutex either process one pending job, leave the loop if quit is
 * set, or go back to waiting.  The packet is only filled for now;
 * writing it to the channel is still a ToDo.  On exit the thread clears
 * p->running.  Always returns NULL.
 */
static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;
    Error *local_err = NULL;

    trace_multifd_send_thread_start(p->id);

    if (multifd_send_initial_packet(p, &local_err) < 0) {
        goto out;
    }
    /* initial packet */
    p->num_packets = 1;

    while (true) {
        qemu_sem_wait(&p->sem);
        qemu_mutex_lock(&p->mutex);

        /* pending work is checked before quit */
        if (p->pending_job) {
            /* snapshot what the trace and sync handling need later */
            uint32_t used = p->pages->used;
            uint64_t packet_num = p->packet_num;
            uint32_t flags = p->flags;

            multifd_send_fill_packet(p);
            p->flags = 0;
            p->num_packets++;
            p->num_pages += used;
            p->pages->used = 0;
            qemu_mutex_unlock(&p->mutex);

            trace_multifd_send(p->id, packet_num, used, flags);

            /* ToDo: send packet here */

            qemu_mutex_lock(&p->mutex);
            p->pending_job--;
            qemu_mutex_unlock(&p->mutex);

            /* a sync job is acknowledged on the global sem_sync */
            if (flags & MULTIFD_FLAG_SYNC) {
                qemu_sem_post(&multifd_send_state->sem_sync);
            }
        } else if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        } else {
            qemu_mutex_unlock(&p->mutex);
            /* sometimes there are spurious wakeups */
        }
    }

out:
    /* an error here also asks all the other send threads to quit */
    if (local_err) {
        multifd_send_terminate_threads(local_err);
    }

    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}
 969
 970 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
 971 {
 972     MultiFDSendParams *p = opaque;
 973     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
 974     Error *local_err = NULL;
 975
 976     if (qio_task_propagate_error(task, &local_err)) {
 977         if (multifd_save_cleanup(&local_err) != 0) {
 978             migrate_set_error(migrate_get_current(), local_err);
 979         }
 980     } else {
 981         p->c = QIO_CHANNEL(sioc);
 982         qio_channel_set_delay(p->c, false);
 983         p->running = true;
 984         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
 985                            QEMU_THREAD_JOINABLE);
 986
 987         atomic_inc(&multifd_send_state->count);
 988     }
 989 }
 990
 991 int multifd_save_setup(void)
 992 {
 993     int thread_count;
 994     uint32_t page_count = migrate_multifd_page_count();
 995     uint8_t i;
 996
 997     if (!migrate_use_multifd()) {
 998         return 0;
 999     }
1000     thread_count = migrate_multifd_channels();
1001     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1002     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1003     atomic_set(&multifd_send_state->count, 0);
1004     multifd_send_state->pages = multifd_pages_init(page_count);
1005     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1006
1007     for (i = 0; i < thread_count; i++) {
1008         MultiFDSendParams *p = &multifd_send_state->params[i];
1009
1010         qemu_mutex_init(&p->mutex);
1011         qemu_sem_init(&p->sem, 0);
1012         qemu_sem_init(&p->sem_sync, 0);
1013         p->quit = false;
1014         p->pending_job = 0;
1015         p->id = i;
1016         p->pages = multifd_pages_init(page_count);
1017         p->packet_len = sizeof(MultiFDPacket_t)
1018                       + sizeof(ram_addr_t) * page_count;
1019         p->packet = g_malloc0(p->packet_len);
1020         p->name = g_strdup_printf("multifdsend_%d", i);
1021         socket_send_channel_create(multifd_new_send_channel_async, p);
1022     }
1023     return 0;
1024 }
1025
/* Global state of the multifd receive side, shared by all recv channels */
struct {
    /* per-channel parameters, one entry per multifd channel */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1035
1036 static void multifd_recv_terminate_threads(Error *err)
1037 {
1038     int i;
1039
1040     if (err) {
1041         MigrationState *s = migrate_get_current();
1042         migrate_set_error(s, err);
1043         if (s->state == MIGRATION_STATUS_SETUP ||
1044             s->state == MIGRATION_STATUS_ACTIVE) {
1045             migrate_set_state(&s->state, s->state,
1046                               MIGRATION_STATUS_FAILED);
1047         }
1048     }
1049
1050     for (i = 0; i < migrate_multifd_channels(); i++) {
1051         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1052
1053         qemu_mutex_lock(&p->mutex);
1054         p->quit = true;
1055         qemu_sem_post(&p->sem);
1056         qemu_mutex_unlock(&p->mutex);
1057     }
1058 }
1059
1060 int multifd_load_cleanup(Error **errp)
1061 {
1062     int i;
1063     int ret = 0;
1064
1065     if (!migrate_use_multifd()) {
1066         return 0;
1067     }
1068     multifd_recv_terminate_threads(NULL);
1069     for (i = 0; i < migrate_multifd_channels(); i++) {
1070         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1071
1072         if (p->running) {
1073             qemu_thread_join(&p->thread);
1074         }
1075         object_unref(OBJECT(p->c));
1076         p->c = NULL;
1077         qemu_mutex_destroy(&p->mutex);
1078         qemu_sem_destroy(&p->sem);
1079         qemu_sem_destroy(&p->sem_sync);
1080         g_free(p->name);
1081         p->name = NULL;
1082         multifd_pages_clear(p->pages);
1083         p->pages = NULL;
1084         p->packet_len = 0;
1085         g_free(p->packet);
1086         p->packet = NULL;
1087     }
1088     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1089     g_free(multifd_recv_state->params);
1090     multifd_recv_state->params = NULL;
1091     g_free(multifd_recv_state);
1092     multifd_recv_state = NULL;
1093
1094     return ret;
1095 }
1096
/*
 * Synchronise the caller with every multifd receive channel.
 *
 * Runs in three passes over the channels: flag a pending job on each one,
 * wait for one post of the global sem_sync per channel while raising the
 * global packet_num to the highest per-channel value, then post each
 * channel's own sem_sync.  Does nothing when multifd is not in use.
 */
static void multifd_recv_sync_main(void)
{
    int i;

    if (!migrate_use_multifd()) {
        return;
    }
    /* Pass 1: mark a pending job on every channel */
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        trace_multifd_recv_sync_main_signal(p->id);
        qemu_mutex_lock(&p->mutex);
        p->pending_job = true;
        qemu_mutex_unlock(&p->mutex);
    }
    /* Pass 2: collect one sem_sync post per channel, track max packet_num */
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        trace_multifd_recv_sync_main_wait(p->id);
        qemu_sem_wait(&multifd_recv_state->sem_sync);
        qemu_mutex_lock(&p->mutex);
        if (multifd_recv_state->packet_num < p->packet_num) {
            multifd_recv_state->packet_num = p->packet_num;
        }
        qemu_mutex_unlock(&p->mutex);
    }
    /* Pass 3: post the per-channel sem_sync the recv threads wait on */
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        trace_multifd_recv_sync_main_signal(p->id);

        qemu_sem_post(&p->sem_sync);
    }
    trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
}
1132
/*
 * Thread body for one multifd receive channel.
 *
 * @opaque: the MultiFDRecvParams of the channel served by this thread
 *
 * Loops unpacking packets with multifd_recv_unfill_packet() and updating
 * the per-channel counters, until unpacking returns non-zero.  A packet
 * carrying MULTIFD_FLAG_SYNC makes the thread post the global sem_sync
 * and then block on its own sem_sync.  Always returns NULL.
 */
static void *multifd_recv_thread(void *opaque)
{
    MultiFDRecvParams *p = opaque;
    Error *local_err = NULL;
    int ret;

    trace_multifd_recv_thread_start(p->id);

    while (true) {
        uint32_t used;
        uint32_t flags;

        /* ToDo: recv packet here */

        qemu_mutex_lock(&p->mutex);
        ret = multifd_recv_unfill_packet(p, &local_err);
        if (ret) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }

        /* Copy out under the mutex what is needed after unlocking */
        used = p->pages->used;
        flags = p->flags;
        trace_multifd_recv(p->id, p->packet_num, used, flags);
        p->pending_job = false;
        p->num_packets++;
        p->num_pages += used;
        qemu_mutex_unlock(&p->mutex);

        if (flags & MULTIFD_FLAG_SYNC) {
            /* Signal the global semaphore, then wait on this channel's own */
            qemu_sem_post(&multifd_recv_state->sem_sync);
            qemu_sem_wait(&p->sem_sync);
        }
    }

    if (local_err) {
        multifd_recv_terminate_threads(local_err);
    }
    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}
1179
1180 int multifd_load_setup(void)
1181 {
1182     int thread_count;
1183     uint32_t page_count = migrate_multifd_page_count();
1184     uint8_t i;
1185
1186     if (!migrate_use_multifd()) {
1187         return 0;
1188     }
1189     thread_count = migrate_multifd_channels();
1190     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1191     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1192     atomic_set(&multifd_recv_state->count, 0);
1193     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1194
1195     for (i = 0; i < thread_count; i++) {
1196         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1197
1198         qemu_mutex_init(&p->mutex);
1199         qemu_sem_init(&p->sem, 0);
1200         qemu_sem_init(&p->sem_sync, 0);
1201         p->quit = false;
1202         p->pending_job = false;
1203         p->id = i;
1204         p->pages = multifd_pages_init(page_count);
1205         p->packet_len = sizeof(MultiFDPacket_t)
1206                       + sizeof(ram_addr_t) * page_count;
1207         p->packet = g_malloc0(p->packet_len);
1208         p->name = g_strdup_printf("multifdrecv_%d", i);
1209     }
1210     return 0;
1211 }
1212
1213 bool multifd_recv_all_channels_created(void)
1214 {
1215     int thread_count = migrate_multifd_channels();
1216
1217     if (!migrate_use_multifd()) {
1218         return true;
1219     }
1220
1221     return thread_count == atomic_read(&multifd_recv_state->count);
1222 }
1223
1224 void multifd_recv_new_channel(QIOChannel *ioc)
1225 {
1226     MultiFDRecvParams *p;
1227     Error *local_err = NULL;
1228     int id;
1229
1230     id = multifd_recv_initial_packet(ioc, &local_err);
1231     if (id < 0) {
1232         multifd_recv_terminate_threads(local_err);
1233         return;
1234     }
1235
1236     p = &multifd_recv_state->params[id];
1237     if (p->c != NULL) {
1238         error_setg(&local_err, "multifd: received id '%d' already setup'",
1239                    id);
1240         multifd_recv_terminate_threads(local_err);
1241         return;
1242     }
1243     p->c = ioc;
1244     object_ref(OBJECT(ioc));
1245     /* initial packet */
1246     p->num_packets = 1;
1247
1248     p->running = true;
1249     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1250                        QEMU_THREAD_JOINABLE);
1251     atomic_inc(&multifd_recv_state->count);
1252     if (multifd_recv_state->count == migrate_multifd_channels()) {
1253         migration_incoming_process();
1254     }
1255 }
1256
1257 /**
1258  * save_page_header: write page header to wire
1259  *
1260  * If this is the 1st block, it also writes the block identification
1261  *
1262  * Returns the number of bytes written
1263  *
1264  * @f: QEMUFile where to send the data
1265  * @block: block that contains the page we want to send
1266  * @offset: offset inside the block for the page
1267  *          in the lower bits, it contains flags
1268  */
1269 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1270                                ram_addr_t offset)
1271 {
1272     size_t size, len;
1273
1274     if (block == rs->last_sent_block) {
1275         offset |= RAM_SAVE_FLAG_CONTINUE;
1276     }
1277     qemu_put_be64(f, offset);
1278     size = 8;
1279
1280     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1281         len = strlen(block->idstr);
1282         qemu_put_byte(f, len);
1283         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1284         size += 1 + len;
1285         rs->last_sent_block = block;
1286     }
1287     return size;
1288 }
1289
1290 /**
1291  * mig_throttle_guest_down: throotle down the guest
1292  *
1293  * Reduce amount of guest cpu execution to hopefully slow down memory
1294  * writes. If guest dirty memory rate is reduced below the rate at
1295  * which we can transfer pages to the destination then we should be
1296  * able to complete migration. Some workloads dirty memory way too
1297  * fast and will not effectively converge, even with auto-converge.
1298  */
1299 static void mig_throttle_guest_down(void)
1300 {
1301     MigrationState *s = migrate_get_current();
1302     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1303     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1304
1305     /* We have not started throttling yet. Let's start it. */
1306     if (!cpu_throttle_active()) {
1307         cpu_throttle_set(pct_initial);
1308     } else {
1309         /* Throttling already on, just increase the rate */
1310         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
1311     }
1312 }
1313
1314 /**
1315  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1316  *
1317  * @rs: current RAM state
1318  * @current_addr: address for the zero page
1319  *
1320  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1321  * The important thing is that a stale (not-yet-0'd) page be replaced
1322  * by the new data.
1323  * As a bonus, if the page wasn't in the cache it gets added so that
1324  * when a small write is made into the 0'd page it gets XBZRLE sent.
1325  */
1326 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1327 {
1328     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1329         return;
1330     }
1331
1332     /* We don't care if this fails to allocate a new cache page
1333      * as long as it updated an old one */
1334     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1335                  ram_counters.dirty_sync_count);
1336 }
1337
/* Encoding byte written right after the page header of an XBZRLE page */
#define ENCODING_FLAG_XBZRLE 0x1
1339
1340 /**
1341  * save_xbzrle_page: compress and send current page
1342  *
1343  * Returns: 1 means that we wrote the page
1344  *          0 means that page is identical to the one already sent
1345  *          -1 means that xbzrle would be longer than normal
1346  *
1347  * @rs: current RAM state
1348  * @current_data: pointer to the address of the page contents
1349  * @current_addr: addr of the page
1350  * @block: block that contains the page we want to send
1351  * @offset: offset inside the block for the page
1352  * @last_stage: if we are at the completion stage
1353  */
1354 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1355                             ram_addr_t current_addr, RAMBlock *block,
1356                             ram_addr_t offset, bool last_stage)
1357 {
1358     int encoded_len = 0, bytes_xbzrle;
1359     uint8_t *prev_cached_page;
1360
1361     if (!cache_is_cached(XBZRLE.cache, current_addr,
1362                          ram_counters.dirty_sync_count)) {
1363         xbzrle_counters.cache_miss++;
1364         if (!last_stage) {
1365             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1366                              ram_counters.dirty_sync_count) == -1) {
1367                 return -1;
1368             } else {
1369                 /* update *current_data when the page has been
1370                    inserted into cache */
1371                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1372             }
1373         }
1374         return -1;
1375     }
1376
1377     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1378
1379     /* save current buffer into memory */
1380     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1381
1382     /* XBZRLE encoding (if there is no overflow) */
1383     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1384                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1385                                        TARGET_PAGE_SIZE);
1386     if (encoded_len == 0) {
1387         trace_save_xbzrle_page_skipping();
1388         return 0;
1389     } else if (encoded_len == -1) {
1390         trace_save_xbzrle_page_overflow();
1391         xbzrle_counters.overflow++;
1392         /* update data in the cache */
1393         if (!last_stage) {
1394             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1395             *current_data = prev_cached_page;
1396         }
1397         return -1;
1398     }
1399
1400     /* we need to update the data in the cache, in order to get the same data */
1401     if (!last_stage) {
1402         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1403     }
1404
1405     /* Send XBZRLE based compressed page */
1406     bytes_xbzrle = save_page_header(rs, rs->f, block,
1407                                     offset | RAM_SAVE_FLAG_XBZRLE);
1408     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1409     qemu_put_be16(rs->f, encoded_len);
1410     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1411     bytes_xbzrle += encoded_len + 1 + 2;
1412     xbzrle_counters.pages++;
1413     xbzrle_counters.bytes += bytes_xbzrle;
1414     ram_counters.transferred += bytes_xbzrle;
1415
1416     return 1;
1417 }
1418
1419 /**
1420  * migration_bitmap_find_dirty: find the next dirty page from start
1421  *
1422  * Called with rcu_read_lock() to protect migration_bitmap
1423  *
1424  * Returns the byte offset within memory region of the start of a dirty page
1425  *
1426  * @rs: current RAM state
1427  * @rb: RAMBlock where to search for dirty pages
1428  * @start: page where we start the search
1429  */
1430 static inline
1431 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1432                                           unsigned long start)
1433 {
1434     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1435     unsigned long *bitmap = rb->bmap;
1436     unsigned long next;
1437
1438     if (!qemu_ram_is_migratable(rb)) {
1439         return size;
1440     }
1441
1442     if (rs->ram_bulk_stage && start > 0) {
1443         next = start + 1;
1444     } else {
1445         next = find_next_bit(bitmap, size, start);
1446     }
1447
1448     return next;
1449 }
1450
1451 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1452                                                 RAMBlock *rb,
1453                                                 unsigned long page)
1454 {
1455     bool ret;
1456
1457     ret = test_and_clear_bit(page, rb->bmap);
1458
1459     if (ret) {
1460         rs->migration_dirty_pages--;
1461     }
1462     return ret;
1463 }
1464
/*
 * Sync the dirty bitmap of @rb for the range [@start, @start + @length)
 * and add the value returned by the sync to rs->migration_dirty_pages.
 * The per-period counter rs->num_dirty_pages_period is handed to the sync
 * helper by address so it can be updated there.
 */
static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
                                        ram_addr_t start, ram_addr_t length)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
                                              &rs->num_dirty_pages_period);
}
1472
1473 /**
1474  * ram_pagesize_summary: calculate all the pagesizes of a VM
1475  *
1476  * Returns a summary bitmap of the page sizes of all RAMBlocks
1477  *
1478  * For VMs with just normal pages this is equivalent to the host page
1479  * size. If it's got some huge pages then it's the OR of all the
1480  * different page sizes.
1481  */
1482 uint64_t ram_pagesize_summary(void)
1483 {
1484     RAMBlock *block;
1485     uint64_t summary = 0;
1486
1487     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1488         summary |= block->page_size;
1489     }
1490
1491     return summary;
1492 }
1493
1494 static void migration_update_rates(RAMState *rs, int64_t end_time)
1495 {
1496     uint64_t iter_count = rs->iterations - rs->iterations_prev;
1497
1498     /* calculate period counters */
1499     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1500                 / (end_time - rs->time_last_bitmap_sync);
1501
1502     if (!iter_count) {
1503         return;
1504     }
1505
1506     if (migrate_use_xbzrle()) {
1507         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1508             rs->xbzrle_cache_miss_prev) / iter_count;
1509         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1510     }
1511 }
1512
1513 static void migration_bitmap_sync(RAMState *rs)
1514 {
1515     RAMBlock *block;
1516     int64_t end_time;
1517     uint64_t bytes_xfer_now;
1518
1519     ram_counters.dirty_sync_count++;
1520
1521     if (!rs->time_last_bitmap_sync) {
1522         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1523     }
1524
1525     trace_migration_bitmap_sync_start();
1526     memory_global_dirty_log_sync();
1527
1528     qemu_mutex_lock(&rs->bitmap_mutex);
1529     rcu_read_lock();
1530     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1531         migration_bitmap_sync_range(rs, block, 0, block->used_length);
1532     }
1533     ram_counters.remaining = ram_bytes_remaining();
1534     rcu_read_unlock();
1535     qemu_mutex_unlock(&rs->bitmap_mutex);
1536
1537     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1538
1539     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1540
1541     /* more than 1 second = 1000 millisecons */
1542     if (end_time > rs->time_last_bitmap_sync + 1000) {
1543         bytes_xfer_now = ram_counters.transferred;
1544
1545         /* During block migration the auto-converge logic incorrectly detects
1546          * that ram migration makes no progress. Avoid this by disabling the
1547          * throttling logic during the bulk phase of block migration. */
1548         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1549             /* The following detection logic can be refined later. For now:
1550                Check to see if the dirtied bytes is 50% more than the approx.
1551                amount of bytes that just got transferred since the last time we
1552                were in this routine. If that happens twice, start or increase
1553                throttling */
1554
1555             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1556                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1557                 (++rs->dirty_rate_high_cnt >= 2)) {
1558                     trace_migration_throttle();
1559                     rs->dirty_rate_high_cnt = 0;
1560                     mig_throttle_guest_down();
1561             }
1562         }
1563
1564         migration_update_rates(rs, end_time);
1565
1566         rs->iterations_prev = rs->iterations;
1567
1568         /* reset period counters */
1569         rs->time_last_bitmap_sync = end_time;
1570         rs->num_dirty_pages_period = 0;
1571         rs->bytes_xfer_prev = bytes_xfer_now;
1572     }
1573     if (migrate_use_events()) {
1574         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1575     }
1576 }
1577
1578 /**
1579  * save_zero_page: send the zero page to the stream
1580  *
1581  * Returns the number of pages written.
1582  *
1583  * @rs: current RAM state
1584  * @block: block that contains the page we want to send
1585  * @offset: offset inside the block for the page
1586  */
1587 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1588 {
1589     uint8_t *p = block->host + offset;
1590     int pages = -1;
1591
1592     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1593         ram_counters.duplicate++;
1594         ram_counters.transferred +=
1595             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1596         qemu_put_byte(rs->f, 0);
1597         ram_counters.transferred += 1;
1598         pages = 1;
1599     }
1600
1601     return pages;
1602 }
1603
1604 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1605 {
1606     if (!migrate_release_ram() || !migration_in_postcopy()) {
1607         return;
1608     }
1609
1610     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1611 }
1612
1613 /*
1614  * @pages: the number of pages written by the control path,
1615  *        < 0 - error
1616  *        > 0 - number of pages written
1617  *
1618  * Return true if the pages has been saved, otherwise false is returned.
1619  */
1620 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1621                               int *pages)
1622 {
1623     uint64_t bytes_xmit = 0;
1624     int ret;
1625
1626     *pages = -1;
1627     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1628                                 &bytes_xmit);
1629     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1630         return false;
1631     }
1632
1633     if (bytes_xmit) {
1634         ram_counters.transferred += bytes_xmit;
1635         *pages = 1;
1636     }
1637
1638     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1639         return true;
1640     }
1641
1642     if (bytes_xmit > 0) {
1643         ram_counters.normal++;
1644     } else if (bytes_xmit == 0) {
1645         ram_counters.duplicate++;
1646     }
1647
1648     return true;
1649 }
1650
1651 /*
1652  * directly send the page to the stream
1653  *
1654  * Returns the number of pages written.
1655  *
1656  * @rs: current RAM state
1657  * @block: block that contains the page we want to send
1658  * @offset: offset inside the block for the page
1659  * @buf: the page to be sent
1660  * @async: send to page asyncly
1661  */
1662 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1663                             uint8_t *buf, bool async)
1664 {
1665     ram_counters.transferred += save_page_header(rs, rs->f, block,
1666                                                  offset | RAM_SAVE_FLAG_PAGE);
1667     if (async) {
1668         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1669                               migrate_release_ram() &
1670                               migration_in_postcopy());
1671     } else {
1672         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1673     }
1674     ram_counters.transferred += TARGET_PAGE_SIZE;
1675     ram_counters.normal++;
1676     return 1;
1677 }
1678
1679 /**
1680  * ram_save_page: send the given page to the stream
1681  *
1682  * Returns the number of pages written.
1683  *          < 0 - error
1684  *          >=0 - Number of pages written - this might legally be 0
1685  *                if xbzrle noticed the page was the same.
1686  *
1687  * @rs: current RAM state
1688  * @block: block that contains the page we want to send
1689  * @offset: offset inside the block for the page
1690  * @last_stage: if we are at the completion stage
1691  */
1692 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1693 {
1694     int pages = -1;
1695     uint8_t *p;
1696     bool send_async = true;
1697     RAMBlock *block = pss->block;
1698     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1699     ram_addr_t current_addr = block->offset + offset;
1700
1701     p = block->host + offset;
1702     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1703
1704     XBZRLE_cache_lock();
1705     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1706         migrate_use_xbzrle()) {
1707         pages = save_xbzrle_page(rs, &p, current_addr, block,
1708                                  offset, last_stage);
1709         if (!last_stage) {
1710             /* Can't send this cached data async, since the cache page
1711              * might get updated before it gets to the wire
1712              */
1713             send_async = false;
1714         }
1715     }
1716
1717     /* XBZRLE overflow or normal page */
1718     if (pages == -1) {
1719         pages = save_normal_page(rs, block, offset, p, send_async);
1720     }
1721
1722     XBZRLE_cache_unlock();
1723
1724     return pages;
1725 }
1726
1727 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1728                                 ram_addr_t offset, uint8_t *source_buf)
1729 {
1730     RAMState *rs = ram_state;
1731     int bytes_sent, blen;
1732     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1733
1734     bytes_sent = save_page_header(rs, f, block, offset |
1735                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1736
1737     /*
1738      * copy it to a internal buffer to avoid it being modified by VM
1739      * so that we can catch up the error during compression and
1740      * decompression
1741      */
1742     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1743     blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1744     if (blen < 0) {
1745         bytes_sent = 0;
1746         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1747         error_report("compressed data failed!");
1748     } else {
1749         bytes_sent += blen;
1750         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1751     }
1752
1753     return bytes_sent;
1754 }
1755
1756 static void flush_compressed_data(RAMState *rs)
1757 {
1758     int idx, len, thread_count;
1759
1760     if (!migrate_use_compression()) {
1761         return;
1762     }
1763     thread_count = migrate_compress_threads();
1764
1765     qemu_mutex_lock(&comp_done_lock);
1766     for (idx = 0; idx < thread_count; idx++) {
1767         while (!comp_param[idx].done) {
1768             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1769         }
1770     }
1771     qemu_mutex_unlock(&comp_done_lock);
1772
1773     for (idx = 0; idx < thread_count; idx++) {
1774         qemu_mutex_lock(&comp_param[idx].mutex);
1775         if (!comp_param[idx].quit) {
1776             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1777             ram_counters.transferred += len;
1778         }
1779         qemu_mutex_unlock(&comp_param[idx].mutex);
1780     }
1781 }
1782
1783 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1784                                        ram_addr_t offset)
1785 {
1786     param->block = block;
1787     param->offset = offset;
1788 }
1789
1790 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1791                                            ram_addr_t offset)
1792 {
1793     int idx, thread_count, bytes_xmit = -1, pages = -1;
1794
1795     thread_count = migrate_compress_threads();
1796     qemu_mutex_lock(&comp_done_lock);
1797     while (true) {
1798         for (idx = 0; idx < thread_count; idx++) {
1799             if (comp_param[idx].done) {
1800                 comp_param[idx].done = false;
1801                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1802                 qemu_mutex_lock(&comp_param[idx].mutex);
1803                 set_compress_params(&comp_param[idx], block, offset);
1804                 qemu_cond_signal(&comp_param[idx].cond);
1805                 qemu_mutex_unlock(&comp_param[idx].mutex);
1806                 pages = 1;
1807                 ram_counters.normal++;
1808                 ram_counters.transferred += bytes_xmit;
1809                 break;
1810             }
1811         }
1812         if (pages > 0) {
1813             break;
1814         } else {
1815             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1816         }
1817     }
1818     qemu_mutex_unlock(&comp_done_lock);
1819
1820     return pages;
1821 }
1822
1823 /**
1824  * find_dirty_block: find the next dirty page and update any state
1825  * associated with the search process.
1826  *
1827  * Returns if a page is found
1828  *
1829  * @rs: current RAM state
1830  * @pss: data about the state of the current dirty page scan
1831  * @again: set to false if the search has scanned the whole of RAM
1832  */
1833 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1834 {
1835     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1836     if (pss->complete_round && pss->block == rs->last_seen_block &&
1837         pss->page >= rs->last_page) {
1838         /*
1839          * We've been once around the RAM and haven't found anything.
1840          * Give up.
1841          */
1842         *again = false;
1843         return false;
1844     }
1845     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1846         /* Didn't find anything in this RAM Block */
1847         pss->page = 0;
1848         pss->block = QLIST_NEXT_RCU(pss->block, next);
1849         if (!pss->block) {
1850             /* Hit the end of the list */
1851             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1852             /* Flag that we've looped */
1853             pss->complete_round = true;
1854             rs->ram_bulk_stage = false;
1855             if (migrate_use_xbzrle()) {
1856                 /* If xbzrle is on, stop using the data compression at this
1857                  * point. In theory, xbzrle can do better than compression.
1858                  */
1859                 flush_compressed_data(rs);
1860             }
1861         }
1862         /* Didn't find anything this time, but try again on the new block */
1863         *again = true;
1864         return false;
1865     } else {
1866         /* Can go around again, but... */
1867         *again = true;
1868         /* We've found something so probably don't need to */
1869         return true;
1870     }
1871 }
1872
1873 /**
1874  * unqueue_page: gets a page of the queue
1875  *
1876  * Helper for 'get_queued_page' - gets a page off the queue
1877  *
1878  * Returns the block of the page (or NULL if none available)
1879  *
1880  * @rs: current RAM state
1881  * @offset: used to return the offset within the RAMBlock
1882  */
1883 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1884 {
1885     RAMBlock *block = NULL;
1886
1887     qemu_mutex_lock(&rs->src_page_req_mutex);
1888     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1889         struct RAMSrcPageRequest *entry =
1890                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1891         block = entry->rb;
1892         *offset = entry->offset;
1893
1894         if (entry->len > TARGET_PAGE_SIZE) {
1895             entry->len -= TARGET_PAGE_SIZE;
1896             entry->offset += TARGET_PAGE_SIZE;
1897         } else {
1898             memory_region_unref(block->mr);
1899             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1900             g_free(entry);
1901             migration_consume_urgent_request();
1902         }
1903     }
1904     qemu_mutex_unlock(&rs->src_page_req_mutex);
1905
1906     return block;
1907 }
1908
1909 /**
1910  * get_queued_page: unqueue a page from the postocpy requests
1911  *
1912  * Skips pages that are already sent (!dirty)
1913  *
1914  * Returns if a queued page is found
1915  *
1916  * @rs: current RAM state
1917  * @pss: data about the state of the current dirty page scan
1918  */
1919 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1920 {
1921     RAMBlock  *block;
1922     ram_addr_t offset;
1923     bool dirty;
1924
1925     do {
1926         block = unqueue_page(rs, &offset);
1927         /*
1928          * We're sending this page, and since it's postcopy nothing else
1929          * will dirty it, and we must make sure it doesn't get sent again
1930          * even if this queue request was received after the background
1931          * search already sent it.
1932          */
1933         if (block) {
1934             unsigned long page;
1935
1936             page = offset >> TARGET_PAGE_BITS;
1937             dirty = test_bit(page, block->bmap);
1938             if (!dirty) {
1939                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1940                        page, test_bit(page, block->unsentmap));
1941             } else {
1942                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1943             }
1944         }
1945
1946     } while (block && !dirty);
1947
1948     if (block) {
1949         /*
1950          * As soon as we start servicing pages out of order, then we have
1951          * to kill the bulk stage, since the bulk stage assumes
1952          * in (migration_bitmap_find_and_reset_dirty) that every page is
1953          * dirty, that's no longer true.
1954          */
1955         rs->ram_bulk_stage = false;
1956
1957         /*
1958          * We want the background search to continue from the queued page
1959          * since the guest is likely to want other pages near to the page
1960          * it just requested.
1961          */
1962         pss->block = block;
1963         pss->page = offset >> TARGET_PAGE_BITS;
1964     }
1965
1966     return !!block;
1967 }
1968
1969 /**
1970  * migration_page_queue_free: drop any remaining pages in the ram
1971  * request queue
1972  *
1973  * It should be empty at the end anyway, but in error cases there may
1974  * be some left.  in case that there is any page left, we drop it.
1975  *
1976  */
1977 static void migration_page_queue_free(RAMState *rs)
1978 {
1979     struct RAMSrcPageRequest *mspr, *next_mspr;
1980     /* This queue generally should be empty - but in the case of a failed
1981      * migration might have some droppings in.
1982      */
1983     rcu_read_lock();
1984     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1985         memory_region_unref(mspr->rb->mr);
1986         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1987         g_free(mspr);
1988     }
1989     rcu_read_unlock();
1990 }
1991
1992 /**
1993  * ram_save_queue_pages: queue the page for transmission
1994  *
1995  * A request from postcopy destination for example.
1996  *
1997  * Returns zero on success or negative on error
1998  *
1999  * @rbname: Name of the RAMBLock of the request. NULL means the
2000  *          same that last one.
2001  * @start: starting address from the start of the RAMBlock
2002  * @len: length (in bytes) to send
2003  */
2004 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2005 {
2006     RAMBlock *ramblock;
2007     RAMState *rs = ram_state;
2008
2009     ram_counters.postcopy_requests++;
2010     rcu_read_lock();
2011     if (!rbname) {
2012         /* Reuse last RAMBlock */
2013         ramblock = rs->last_req_rb;
2014
2015         if (!ramblock) {
2016             /*
2017              * Shouldn't happen, we can't reuse the last RAMBlock if
2018              * it's the 1st request.
2019              */
2020             error_report("ram_save_queue_pages no previous block");
2021             goto err;
2022         }
2023     } else {
2024         ramblock = qemu_ram_block_by_name(rbname);
2025
2026         if (!ramblock) {
2027             /* We shouldn't be asked for a non-existent RAMBlock */
2028             error_report("ram_save_queue_pages no block '%s'", rbname);
2029             goto err;
2030         }
2031         rs->last_req_rb = ramblock;
2032     }
2033     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2034     if (start+len > ramblock->used_length) {
2035         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2036                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2037                      __func__, start, len, ramblock->used_length);
2038         goto err;
2039     }
2040
2041     struct RAMSrcPageRequest *new_entry =
2042         g_malloc0(sizeof(struct RAMSrcPageRequest));
2043     new_entry->rb = ramblock;
2044     new_entry->offset = start;
2045     new_entry->len = len;
2046
2047     memory_region_ref(ramblock->mr);
2048     qemu_mutex_lock(&rs->src_page_req_mutex);
2049     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2050     migration_make_urgent_request();
2051     qemu_mutex_unlock(&rs->src_page_req_mutex);
2052     rcu_read_unlock();
2053
2054     return 0;
2055
2056 err:
2057     rcu_read_unlock();
2058     return -1;
2059 }
2060
2061 static bool save_page_use_compression(RAMState *rs)
2062 {
2063     if (!migrate_use_compression()) {
2064         return false;
2065     }
2066
2067     /*
2068      * If xbzrle is on, stop using the data compression after first
2069      * round of migration even if compression is enabled. In theory,
2070      * xbzrle can do better than compression.
2071      */
2072     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2073         return true;
2074     }
2075
2076     return false;
2077 }
2078
2079 /**
2080  * ram_save_target_page: save one target page
2081  *
2082  * Returns the number of pages written
2083  *
2084  * @rs: current RAM state
2085  * @pss: data about the page we want to send
2086  * @last_stage: if we are at the completion stage
2087  */
2088 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2089                                 bool last_stage)
2090 {
2091     RAMBlock *block = pss->block;
2092     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2093     int res;
2094
2095     if (control_save_page(rs, block, offset, &res)) {
2096         return res;
2097     }
2098
2099     /*
2100      * When starting the process of a new block, the first page of
2101      * the block should be sent out before other pages in the same
2102      * block, and all the pages in last block should have been sent
2103      * out, keeping this order is important, because the 'cont' flag
2104      * is used to avoid resending the block name.
2105      */
2106     if (block != rs->last_sent_block && save_page_use_compression(rs)) {
2107             flush_compressed_data(rs);
2108     }
2109
2110     res = save_zero_page(rs, block, offset);
2111     if (res > 0) {
2112         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2113          * page would be stale
2114          */
2115         if (!save_page_use_compression(rs)) {
2116             XBZRLE_cache_lock();
2117             xbzrle_cache_zero_page(rs, block->offset + offset);
2118             XBZRLE_cache_unlock();
2119         }
2120         ram_release_pages(block->idstr, offset, res);
2121         return res;
2122     }
2123
2124     /*
2125      * Make sure the first page is sent out before other pages.
2126      *
2127      * we post it as normal page as compression will take much
2128      * CPU resource.
2129      */
2130     if (block == rs->last_sent_block && save_page_use_compression(rs)) {
2131         return compress_page_with_multi_thread(rs, block, offset);
2132     }
2133
2134     return ram_save_page(rs, pss, last_stage);
2135 }
2136
2137 /**
2138  * ram_save_host_page: save a whole host page
2139  *
2140  * Starting at *offset send pages up to the end of the current host
2141  * page. It's valid for the initial offset to point into the middle of
2142  * a host page in which case the remainder of the hostpage is sent.
2143  * Only dirty target pages are sent. Note that the host page size may
2144  * be a huge page for this block.
2145  * The saving stops at the boundary of the used_length of the block
2146  * if the RAMBlock isn't a multiple of the host page size.
2147  *
2148  * Returns the number of pages written or negative on error
2149  *
2150  * @rs: current RAM state
2151  * @ms: current migration state
2152  * @pss: data about the page we want to send
2153  * @last_stage: if we are at the completion stage
2154  */
2155 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2156                               bool last_stage)
2157 {
2158     int tmppages, pages = 0;
2159     size_t pagesize_bits =
2160         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2161
2162     if (!qemu_ram_is_migratable(pss->block)) {
2163         error_report("block %s should not be migrated !", pss->block->idstr);
2164         return 0;
2165     }
2166
2167     do {
2168         /* Check the pages is dirty and if it is send it */
2169         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2170             pss->page++;
2171             continue;
2172         }
2173
2174         tmppages = ram_save_target_page(rs, pss, last_stage);
2175         if (tmppages < 0) {
2176             return tmppages;
2177         }
2178
2179         pages += tmppages;
2180         if (pss->block->unsentmap) {
2181             clear_bit(pss->page, pss->block->unsentmap);
2182         }
2183
2184         pss->page++;
2185     } while ((pss->page & (pagesize_bits - 1)) &&
2186              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2187
2188     /* The offset we leave with is the last one we looked at */
2189     pss->page--;
2190     return pages;
2191 }
2192
2193 /**
2194  * ram_find_and_save_block: finds a dirty page and sends it to f
2195  *
2196  * Called within an RCU critical section.
2197  *
2198  * Returns the number of pages written where zero means no dirty pages
2199  *
2200  * @rs: current RAM state
2201  * @last_stage: if we are at the completion stage
2202  *
2203  * On systems where host-page-size > target-page-size it will send all the
2204  * pages in a host page that are dirty.
2205  */
2206
2207 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2208 {
2209     PageSearchStatus pss;
2210     int pages = 0;
2211     bool again, found;
2212
2213     /* No dirty page as there is zero RAM */
2214     if (!ram_bytes_total()) {
2215         return pages;
2216     }
2217
2218     pss.block = rs->last_seen_block;
2219     pss.page = rs->last_page;
2220     pss.complete_round = false;
2221
2222     if (!pss.block) {
2223         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2224     }
2225
2226     do {
2227         again = true;
2228         found = get_queued_page(rs, &pss);
2229
2230         if (!found) {
2231             /* priority queue empty, so just search for something dirty */
2232             found = find_dirty_block(rs, &pss, &again);
2233         }
2234
2235         if (found) {
2236             pages = ram_save_host_page(rs, &pss, last_stage);
2237         }
2238     } while (!pages && again);
2239
2240     rs->last_seen_block = pss.block;
2241     rs->last_page = pss.page;
2242
2243     return pages;
2244 }
2245
2246 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2247 {
2248     uint64_t pages = size / TARGET_PAGE_SIZE;
2249
2250     if (zero) {
2251         ram_counters.duplicate += pages;
2252     } else {
2253         ram_counters.normal += pages;
2254         ram_counters.transferred += size;
2255         qemu_update_position(f, size);
2256     }
2257 }
2258
2259 uint64_t ram_bytes_total(void)
2260 {
2261     RAMBlock *block;
2262     uint64_t total = 0;
2263
2264     rcu_read_lock();
2265     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2266         total += block->used_length;
2267     }
2268     rcu_read_unlock();
2269     return total;
2270 }
2271
2272 static void xbzrle_load_setup(void)
2273 {
2274     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2275 }
2276
2277 static void xbzrle_load_cleanup(void)
2278 {
2279     g_free(XBZRLE.decoded_buf);
2280     XBZRLE.decoded_buf = NULL;
2281 }
2282
2283 static void ram_state_cleanup(RAMState **rsp)
2284 {
2285     if (*rsp) {
2286         migration_page_queue_free(*rsp);
2287         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2288         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2289         g_free(*rsp);
2290         *rsp = NULL;
2291     }
2292 }
2293
2294 static void xbzrle_cleanup(void)
2295 {
2296     XBZRLE_cache_lock();
2297     if (XBZRLE.cache) {
2298         cache_fini(XBZRLE.cache);
2299         g_free(XBZRLE.encoded_buf);
2300         g_free(XBZRLE.current_buf);
2301         g_free(XBZRLE.zero_target_page);
2302         XBZRLE.cache = NULL;
2303         XBZRLE.encoded_buf = NULL;
2304         XBZRLE.current_buf = NULL;
2305         XBZRLE.zero_target_page = NULL;
2306     }
2307     XBZRLE_cache_unlock();
2308 }
2309
2310 static void ram_save_cleanup(void *opaque)
2311 {
2312     RAMState **rsp = opaque;
2313     RAMBlock *block;
2314
2315     /* caller have hold iothread lock or is in a bh, so there is
2316      * no writing race against this migration_bitmap
2317      */
2318     memory_global_dirty_log_stop();
2319
2320     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2321         g_free(block->bmap);
2322         block->bmap = NULL;
2323         g_free(block->unsentmap);
2324         block->unsentmap = NULL;
2325     }
2326
2327     xbzrle_cleanup();
2328     compress_threads_save_cleanup();
2329     ram_state_cleanup(rsp);
2330 }
2331
2332 static void ram_state_reset(RAMState *rs)
2333 {
2334     rs->last_seen_block = NULL;
2335     rs->last_sent_block = NULL;
2336     rs->last_page = 0;
2337     rs->last_version = ram_list.version;
2338     rs->ram_bulk_stage = true;
2339 }
2340
/*
 * Time limit in milliseconds; the original note describes it as half of
 * the buffered_file limit.
 * NOTE(review): the code that consumes MAX_WAIT is outside this section.
 */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
2342
/*
 * Debug helper: print a bitmap to stderr, 128 bits per line, using '1'
 * for set bits and '.' for clear bits.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * @todump: bitmap to print.  A NULL bitmap is ignored: dirty bitmaps are
 *          kept per RAMBlock, so there is no global one to fall back to.
 * @expected: the uninteresting bit value
 * @pages: number of bits in @todump
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to dump; avoid dereferencing a NULL bitmap */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so the terminator fits */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2376
2377 /* **** functions for postcopy ***** */
2378
2379 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2380 {
2381     struct RAMBlock *block;
2382
2383     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2384         unsigned long *bitmap = block->bmap;
2385         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2386         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2387
2388         while (run_start < range) {
2389             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2390             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2391                               (run_end - run_start) << TARGET_PAGE_BITS);
2392             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2393         }
2394     }
2395 }
2396
2397 /**
2398  * postcopy_send_discard_bm_ram: discard a RAMBlock
2399  *
2400  * Returns zero on success
2401  *
2402  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2403  * Note: At this point the 'unsentmap' is the processed bitmap combined
2404  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2405  *
2406  * @ms: current migration state
2407  * @pds: state for postcopy
2408  * @start: RAMBlock starting page
2409  * @length: RAMBlock size
2410  */
2411 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2412                                         PostcopyDiscardState *pds,
2413                                         RAMBlock *block)
2414 {
2415     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2416     unsigned long current;
2417     unsigned long *unsentmap = block->unsentmap;
2418
2419     for (current = 0; current < end; ) {
2420         unsigned long one = find_next_bit(unsentmap, end, current);
2421
2422         if (one <= end) {
2423             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2424             unsigned long discard_length;
2425
2426             if (zero >= end) {
2427                 discard_length = end - one;
2428             } else {
2429                 discard_length = zero - one;
2430             }
2431             if (discard_length) {
2432                 postcopy_discard_send_range(ms, pds, one, discard_length);
2433             }
2434             current = one + discard_length;
2435         } else {
2436             current = one;
2437         }
2438     }
2439
2440     return 0;
2441 }
2442
2443 /**
2444  * postcopy_each_ram_send_discard: discard all RAMBlocks
2445  *
2446  * Returns 0 for success or negative for error
2447  *
2448  * Utility for the outgoing postcopy code.
2449  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2450  *   passing it bitmap indexes and name.
2451  * (qemu_ram_foreach_block ends up passing unscaled lengths
2452  *  which would mean postcopy code would have to deal with target page)
2453  *
2454  * @ms: current migration state
2455  */
2456 static int postcopy_each_ram_send_discard(MigrationState *ms)
2457 {
2458     struct RAMBlock *block;
2459     int ret;
2460
2461     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2462         PostcopyDiscardState *pds =
2463             postcopy_discard_send_init(ms, block->idstr);
2464
2465         /*
2466          * Postcopy sends chunks of bitmap over the wire, but it
2467          * just needs indexes at this point, avoids it having
2468          * target page specific code.
2469          */
2470         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2471         postcopy_discard_send_finish(ms, pds);
2472         if (ret) {
2473             return ret;
2474         }
2475     }
2476
2477     return 0;
2478 }
2479
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * The function walks "runs" of sent pages (unsent_pass) or dirty pages
 * (!unsent_pass).  Whenever a run starts or ends in the middle of a host
 * page, that whole host page is marked unsent and dirty, and a discard
 * for it is sent where required.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    /* Number of target pages in the used part of the block */
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    /* Target-page index where the run currently being examined starts */
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        /* Despite the name this is a target-page index, not an address */
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the first target page of the host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
2606
2607 /**
2608  * postcopy_chuck_hostpages: discrad any partially sent host page
2609  *
2610  * Utility for the outgoing postcopy code.
2611  *
2612  * Discard any partially sent host-page size chunks, mark any partially
2613  * dirty host-page size chunks as all dirty.  In this case the host-page
2614  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2615  *
2616  * Returns zero on success
2617  *
2618  * @ms: current migration state
2619  * @block: block we want to work with
2620  */
2621 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2622 {
2623     PostcopyDiscardState *pds =
2624         postcopy_discard_send_init(ms, block->idstr);
2625
2626     /* First pass: Discard all partially sent host pages */
2627     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2628     /*
2629      * Second pass: Ensure that all partially dirty host pages are made
2630      * fully dirty.
2631      */
2632     postcopy_chunk_hostpages_pass(ms, false, block, pds);
2633
2634     postcopy_discard_send_finish(ms, pds);
2635     return 0;
2636 }
2637
2638 /**
2639  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2640  *
2641  * Returns zero on success
2642  *
2643  * Transmit the set of pages to be discarded after precopy to the target
2644  * these are pages that:
2645  *     a) Have been previously transmitted but are now dirty again
2646  *     b) Pages that have never been transmitted, this ensures that
2647  *        any pages on the destination that have been mapped by background
2648  *        tasks get discarded (transparent huge pages is the specific concern)
2649  * Hopefully this is pretty sparse
2650  *
2651  * @ms: current migration state
2652  */
2653 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2654 {
2655     RAMState *rs = ram_state;
2656     RAMBlock *block;
2657     int ret;
2658
2659     rcu_read_lock();
2660
2661     /* This should be our last sync, the src is now paused */
2662     migration_bitmap_sync(rs);
2663
2664     /* Easiest way to make sure we don't resume in the middle of a host-page */
2665     rs->last_seen_block = NULL;
2666     rs->last_sent_block = NULL;
2667     rs->last_page = 0;
2668
2669     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2670         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2671         unsigned long *bitmap = block->bmap;
2672         unsigned long *unsentmap = block->unsentmap;
2673
2674         if (!unsentmap) {
2675             /* We don't have a safe way to resize the sentmap, so
2676              * if the bitmap was resized it will be NULL at this
2677              * point.
2678              */
2679             error_report("migration ram resized during precopy phase");
2680             rcu_read_unlock();
2681             return -EINVAL;
2682         }
2683         /* Deal with TPS != HPS and huge pages */
2684         ret = postcopy_chunk_hostpages(ms, block);
2685         if (ret) {
2686             rcu_read_unlock();
2687             return ret;
2688         }
2689
2690         /*
2691          * Update the unsentmap to be unsentmap = unsentmap | dirty
2692          */
2693         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2694 #ifdef DEBUG_POSTCOPY
2695         ram_debug_dump_bitmap(unsentmap, true, pages);
2696 #endif
2697     }
2698     trace_ram_postcopy_send_discard_bitmap();
2699
2700     ret = postcopy_each_ram_send_discard(ms);
2701     rcu_read_unlock();
2702
2703     return ret;
2704 }
2705
2706 /**
2707  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2708  *
2709  * Returns zero on success
2710  *
2711  * @rbname: name of the RAMBlock of the request. NULL means the
2712  *          same that last one.
2713  * @start: RAMBlock starting page
2714  * @length: RAMBlock size
2715  */
2716 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2717 {
2718     int ret = -1;
2719
2720     trace_ram_discard_range(rbname, start, length);
2721
2722     rcu_read_lock();
2723     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2724
2725     if (!rb) {
2726         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2727         goto err;
2728     }
2729
2730     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2731                  length >> qemu_target_page_bits());
2732     ret = ram_block_discard_range(rb, start, length);
2733
2734 err:
2735     rcu_read_unlock();
2736
2737     return ret;
2738 }
2739
2740 /*
2741  * For every allocation, we will try not to crash the VM if the
2742  * allocation failed.
2743  */
2744 static int xbzrle_init(void)
2745 {
2746     Error *local_err = NULL;
2747
2748     if (!migrate_use_xbzrle()) {
2749         return 0;
2750     }
2751
2752     XBZRLE_cache_lock();
2753
2754     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2755     if (!XBZRLE.zero_target_page) {
2756         error_report("%s: Error allocating zero page", __func__);
2757         goto err_out;
2758     }
2759
2760     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2761                               TARGET_PAGE_SIZE, &local_err);
2762     if (!XBZRLE.cache) {
2763         error_report_err(local_err);
2764         goto free_zero_page;
2765     }
2766
2767     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2768     if (!XBZRLE.encoded_buf) {
2769         error_report("%s: Error allocating encoded_buf", __func__);
2770         goto free_cache;
2771     }
2772
2773     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2774     if (!XBZRLE.current_buf) {
2775         error_report("%s: Error allocating current_buf", __func__);
2776         goto free_encoded_buf;
2777     }
2778
2779     /* We are all good */
2780     XBZRLE_cache_unlock();
2781     return 0;
2782
2783 free_encoded_buf:
2784     g_free(XBZRLE.encoded_buf);
2785     XBZRLE.encoded_buf = NULL;
2786 free_cache:
2787     cache_fini(XBZRLE.cache);
2788     XBZRLE.cache = NULL;
2789 free_zero_page:
2790     g_free(XBZRLE.zero_target_page);
2791     XBZRLE.zero_target_page = NULL;
2792 err_out:
2793     XBZRLE_cache_unlock();
2794     return -ENOMEM;
2795 }
2796
2797 static int ram_state_init(RAMState **rsp)
2798 {
2799     *rsp = g_try_new0(RAMState, 1);
2800
2801     if (!*rsp) {
2802         error_report("%s: Init ramstate fail", __func__);
2803         return -1;
2804     }
2805
2806     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2807     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2808     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2809
2810     /*
2811      * Count the total number of pages used by ram blocks not including any
2812      * gaps due to alignment or unplugs.
2813      */
2814     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2815
2816     ram_state_reset(*rsp);
2817
2818     return 0;
2819 }
2820
2821 static void ram_list_init_bitmaps(void)
2822 {
2823     RAMBlock *block;
2824     unsigned long pages;
2825
2826     /* Skip setting bitmap if there is no RAM */
2827     if (ram_bytes_total()) {
2828         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2829             pages = block->max_length >> TARGET_PAGE_BITS;
2830             block->bmap = bitmap_new(pages);
2831             bitmap_set(block->bmap, 0, pages);
2832             if (migrate_postcopy_ram()) {
2833                 block->unsentmap = bitmap_new(pages);
2834                 bitmap_set(block->unsentmap, 0, pages);
2835             }
2836         }
2837     }
2838 }
2839
/*
 * Create the migration bitmaps, start global dirty logging and perform a
 * first bitmap sync for @rs.
 *
 * The whole sequence runs with the iothread lock and the ramlist lock
 * held, inside an RCU read-side critical section; the three are released
 * in the reverse order of acquisition.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2855
2856 static int ram_init_all(RAMState **rsp)
2857 {
2858     if (ram_state_init(rsp)) {
2859         return -1;
2860     }
2861
2862     if (xbzrle_init()) {
2863         ram_state_cleanup(rsp);
2864         return -1;
2865     }
2866
2867     ram_init_bitmaps(*rsp);
2868
2869     return 0;
2870 }
2871
2872 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2873 {
2874     RAMBlock *block;
2875     uint64_t pages = 0;
2876
2877     /*
2878      * Postcopy is not using xbzrle/compression, so no need for that.
2879      * Also, since source are already halted, we don't need to care
2880      * about dirty page logging as well.
2881      */
2882
2883     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2884         pages += bitmap_count_one(block->bmap,
2885                                   block->used_length >> TARGET_PAGE_BITS);
2886     }
2887
2888     /* This may not be aligned with current bitmaps. Recalculate. */
2889     rs->migration_dirty_pages = pages;
2890
2891     rs->last_seen_block = NULL;
2892     rs->last_sent_block = NULL;
2893     rs->last_page = 0;
2894     rs->last_version = ram_list.version;
2895     /*
2896      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2897      * matter what we have sent.
2898      */
2899     rs->ram_bulk_stage = false;
2900
2901     /* Update RAMState cache of output QEMUFile */
2902     rs->f = out;
2903
2904     trace_ram_state_resume_prepare(pages);
2905 }
2906
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
2913
2914 /**
2915  * ram_save_setup: Setup RAM for migration
2916  *
2917  * Returns zero to indicate success and negative for error
2918  *
2919  * @f: QEMUFile where to send the data
2920  * @opaque: RAMState pointer
2921  */
2922 static int ram_save_setup(QEMUFile *f, void *opaque)
2923 {
2924     RAMState **rsp = opaque;
2925     RAMBlock *block;
2926
2927     if (compress_threads_save_setup()) {
2928         return -1;
2929     }
2930
2931     /* migration has already setup the bitmap, reuse it. */
2932     if (!migration_in_colo_state()) {
2933         if (ram_init_all(rsp) != 0) {
2934             compress_threads_save_cleanup();
2935             return -1;
2936         }
2937     }
2938     (*rsp)->f = f;
2939
2940     rcu_read_lock();
2941
2942     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2943
2944     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2945         qemu_put_byte(f, strlen(block->idstr));
2946         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2947         qemu_put_be64(f, block->used_length);
2948         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2949             qemu_put_be64(f, block->page_size);
2950         }
2951     }
2952
2953     rcu_read_unlock();
2954
2955     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2956     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2957
2958     multifd_send_sync_main();
2959     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2960
2961     return 0;
2962 }
2963
2964 /**
2965  * ram_save_iterate: iterative stage for migration
2966  *
2967  * Returns zero to indicate success and negative for error
2968  *
2969  * @f: QEMUFile where to send the data
2970  * @opaque: RAMState pointer
2971  */
2972 static int ram_save_iterate(QEMUFile *f, void *opaque)
2973 {
2974     RAMState **temp = opaque;
2975     RAMState *rs = *temp;
2976     int ret;
2977     int i;
2978     int64_t t0;
2979     int done = 0;
2980
2981     if (blk_mig_bulk_active()) {
2982         /* Avoid transferring ram during bulk phase of block migration as
2983          * the bulk phase will usually take a long time and transferring
2984          * ram updates during that time is pointless. */
2985         goto out;
2986     }
2987
2988     rcu_read_lock();
2989     if (ram_list.version != rs->last_version) {
2990         ram_state_reset(rs);
2991     }
2992
2993     /* Read version before ram_list.blocks */
2994     smp_rmb();
2995
2996     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2997
2998     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2999     i = 0;
3000     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3001             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3002         int pages;
3003
3004         if (qemu_file_get_error(f)) {
3005             break;
3006         }
3007
3008         pages = ram_find_and_save_block(rs, false);
3009         /* no more pages to sent */
3010         if (pages == 0) {
3011             done = 1;
3012             break;
3013         }
3014         rs->iterations++;
3015
3016         /* we want to check in the 1st loop, just in case it was the 1st time
3017            and we had to sync the dirty bitmap.
3018            qemu_get_clock_ns() is a bit expensive, so we only check each some
3019            iterations
3020         */
3021         if ((i & 63) == 0) {
3022             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3023             if (t1 > MAX_WAIT) {
3024                 trace_ram_save_iterate_big_wait(t1, i);
3025                 break;
3026             }
3027         }
3028         i++;
3029     }
3030     flush_compressed_data(rs);
3031     rcu_read_unlock();
3032
3033     /*
3034      * Must occur before EOS (or any QEMUFile operation)
3035      * because of RDMA protocol.
3036      */
3037     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3038
3039     multifd_send_sync_main();
3040 out:
3041     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3042     ram_counters.transferred += 8;
3043
3044     ret = qemu_file_get_error(f);
3045     if (ret < 0) {
3046         return ret;
3047     }
3048
3049     return done;
3050 }
3051
3052 /**
3053  * ram_save_complete: function called to send the remaining amount of ram
3054  *
3055  * Returns zero to indicate success
3056  *
3057  * Called with iothread lock
3058  *
3059  * @f: QEMUFile where to send the data
3060  * @opaque: RAMState pointer
3061  */
3062 static int ram_save_complete(QEMUFile *f, void *opaque)
3063 {
3064     RAMState **temp = opaque;
3065     RAMState *rs = *temp;
3066
3067     rcu_read_lock();
3068
3069     if (!migration_in_postcopy()) {
3070         migration_bitmap_sync(rs);
3071     }
3072
3073     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3074
3075     /* try transferring iterative blocks of memory */
3076
3077     /* flush all remaining blocks regardless of rate limiting */
3078     while (true) {
3079         int pages;
3080
3081         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3082         /* no more blocks to sent */
3083         if (pages == 0) {
3084             break;
3085         }
3086     }
3087
3088     flush_compressed_data(rs);
3089     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3090
3091     rcu_read_unlock();
3092
3093     multifd_send_sync_main();
3094     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3095
3096     return 0;
3097 }
3098
3099 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3100                              uint64_t *res_precopy_only,
3101                              uint64_t *res_compatible,
3102                              uint64_t *res_postcopy_only)
3103 {
3104     RAMState **temp = opaque;
3105     RAMState *rs = *temp;
3106     uint64_t remaining_size;
3107
3108     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3109
3110     if (!migration_in_postcopy() &&
3111         remaining_size < max_size) {
3112         qemu_mutex_lock_iothread();
3113         rcu_read_lock();
3114         migration_bitmap_sync(rs);
3115         rcu_read_unlock();
3116         qemu_mutex_unlock_iothread();
3117         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3118     }
3119
3120     if (migrate_postcopy_ram()) {
3121         /* We can do postcopy, and all the data is postcopiable */
3122         *res_compatible += remaining_size;
3123     } else {
3124         *res_precopy_only += remaining_size;
3125     }
3126 }
3127
3128 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3129 {
3130     unsigned int xh_len;
3131     int xh_flags;
3132     uint8_t *loaded_data;
3133
3134     /* extract RLE header */
3135     xh_flags = qemu_get_byte(f);
3136     xh_len = qemu_get_be16(f);
3137
3138     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3139         error_report("Failed to load XBZRLE page - wrong compression!");
3140         return -1;
3141     }
3142
3143     if (xh_len > TARGET_PAGE_SIZE) {
3144         error_report("Failed to load XBZRLE page - len overflow!");
3145         return -1;
3146     }
3147     loaded_data = XBZRLE.decoded_buf;
3148     /* load data and decode */
3149     /* it can change loaded_data to point to an internal buffer */
3150     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3151
3152     /* decode RLE */
3153     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3154                              TARGET_PAGE_SIZE) == -1) {
3155         error_report("Failed to load XBZRLE page - decode error!");
3156         return -1;
3157     }
3158
3159     return 0;
3160 }
3161
3162 /**
3163  * ram_block_from_stream: read a RAMBlock id from the migration stream
3164  *
3165  * Must be called from within a rcu critical section.
3166  *
3167  * Returns a pointer from within the RCU-protected ram_list.
3168  *
3169  * @f: QEMUFile where to read the data from
3170  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3171  */
3172 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3173 {
3174     static RAMBlock *block = NULL;
3175     char id[256];
3176     uint8_t len;
3177
3178     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3179         if (!block) {
3180             error_report("Ack, bad migration stream!");
3181             return NULL;
3182         }
3183         return block;
3184     }
3185
3186     len = qemu_get_byte(f);
3187     qemu_get_buffer(f, (uint8_t *)id, len);
3188     id[len] = 0;
3189
3190     block = qemu_ram_block_by_name(id);
3191     if (!block) {
3192         error_report("Can't find block %s", id);
3193         return NULL;
3194     }
3195
3196     if (!qemu_ram_is_migratable(block)) {
3197         error_report("block %s should not be migrated !", id);
3198         return NULL;
3199     }
3200
3201     return block;
3202 }
3203
3204 static inline void *host_from_ram_block_offset(RAMBlock *block,
3205                                                ram_addr_t offset)
3206 {
3207     if (!offset_in_ramblock(block, offset)) {
3208         return NULL;
3209     }
3210
3211     return block->host + offset;
3212 }
3213
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with @ch.  When @ch is zero and the range is
 * already all zero, the memory is left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Skip the write when it would not change anything */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3230
3231 /* return the size after decompression, or negative value on error */
3232 static int
3233 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3234                      const uint8_t *source, size_t source_len)
3235 {
3236     int err;
3237
3238     err = inflateReset(stream);
3239     if (err != Z_OK) {
3240         return -1;
3241     }
3242
3243     stream->avail_in = source_len;
3244     stream->next_in = (uint8_t *)source;
3245     stream->avail_out = dest_len;
3246     stream->next_out = dest;
3247
3248     err = inflate(stream, Z_NO_FLUSH);
3249     if (err != Z_STREAM_END) {
3250         return -1;
3251     }
3252
3253     return stream->total_out;
3254 }
3255
3256 static void *do_data_decompress(void *opaque)
3257 {
3258     DecompressParam *param = opaque;
3259     unsigned long pagesize;
3260     uint8_t *des;
3261     int len, ret;
3262
3263     qemu_mutex_lock(&param->mutex);
3264     while (!param->quit) {
3265         if (param->des) {
3266             des = param->des;
3267             len = param->len;
3268             param->des = 0;
3269             qemu_mutex_unlock(&param->mutex);
3270
3271             pagesize = TARGET_PAGE_SIZE;
3272
3273             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3274                                        param->compbuf, len);
3275             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3276                 error_report("decompress data failed");
3277                 qemu_file_set_error(decomp_file, ret);
3278             }
3279
3280             qemu_mutex_lock(&decomp_done_lock);
3281             param->done = true;
3282             qemu_cond_signal(&decomp_done_cond);
3283             qemu_mutex_unlock(&decomp_done_lock);
3284
3285             qemu_mutex_lock(&param->mutex);
3286         } else {
3287             qemu_cond_wait(&param->cond, &param->mutex);
3288         }
3289     }
3290     qemu_mutex_unlock(&param->mutex);
3291
3292     return NULL;
3293 }
3294
3295 static int wait_for_decompress_done(void)
3296 {
3297     int idx, thread_count;
3298
3299     if (!migrate_use_compression()) {
3300         return 0;
3301     }
3302
3303     thread_count = migrate_decompress_threads();
3304     qemu_mutex_lock(&decomp_done_lock);
3305     for (idx = 0; idx < thread_count; idx++) {
3306         while (!decomp_param[idx].done) {
3307             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3308         }
3309     }
3310     qemu_mutex_unlock(&decomp_done_lock);
3311     return qemu_file_get_error(decomp_file);
3312 }
3313
3314 static void compress_threads_load_cleanup(void)
3315 {
3316     int i, thread_count;
3317
3318     if (!migrate_use_compression()) {
3319         return;
3320     }
3321     thread_count = migrate_decompress_threads();
3322     for (i = 0; i < thread_count; i++) {
3323         /*
3324          * we use it as a indicator which shows if the thread is
3325          * properly init'd or not
3326          */
3327         if (!decomp_param[i].compbuf) {
3328             break;
3329         }
3330
3331         qemu_mutex_lock(&decomp_param[i].mutex);
3332         decomp_param[i].quit = true;
3333         qemu_cond_signal(&decomp_param[i].cond);
3334         qemu_mutex_unlock(&decomp_param[i].mutex);
3335     }
3336     for (i = 0; i < thread_count; i++) {
3337         if (!decomp_param[i].compbuf) {
3338             break;
3339         }
3340
3341         qemu_thread_join(decompress_threads + i);
3342         qemu_mutex_destroy(&decomp_param[i].mutex);
3343         qemu_cond_destroy(&decomp_param[i].cond);
3344         inflateEnd(&decomp_param[i].stream);
3345         g_free(decomp_param[i].compbuf);
3346         decomp_param[i].compbuf = NULL;
3347     }
3348     g_free(decompress_threads);
3349     g_free(decomp_param);
3350     decompress_threads = NULL;
3351     decomp_param = NULL;
3352     decomp_file = NULL;
3353 }
3354
3355 static int compress_threads_load_setup(QEMUFile *f)
3356 {
3357     int i, thread_count;
3358
3359     if (!migrate_use_compression()) {
3360         return 0;
3361     }
3362
3363     thread_count = migrate_decompress_threads();
3364     decompress_threads = g_new0(QemuThread, thread_count);
3365     decomp_param = g_new0(DecompressParam, thread_count);
3366     qemu_mutex_init(&decomp_done_lock);
3367     qemu_cond_init(&decomp_done_cond);
3368     decomp_file = f;
3369     for (i = 0; i < thread_count; i++) {
3370         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3371             goto exit;
3372         }
3373
3374         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3375         qemu_mutex_init(&decomp_param[i].mutex);
3376         qemu_cond_init(&decomp_param[i].cond);
3377         decomp_param[i].done = true;
3378         decomp_param[i].quit = false;
3379         qemu_thread_create(decompress_threads + i, "decompress",
3380                            do_data_decompress, decomp_param + i,
3381                            QEMU_THREAD_JOINABLE);
3382     }
3383     return 0;
3384 exit:
3385     compress_threads_load_cleanup();
3386     return -1;
3387 }
3388
3389 static void decompress_data_with_multi_threads(QEMUFile *f,
3390                                                void *host, int len)
3391 {
3392     int idx, thread_count;
3393
3394     thread_count = migrate_decompress_threads();
3395     qemu_mutex_lock(&decomp_done_lock);
3396     while (true) {
3397         for (idx = 0; idx < thread_count; idx++) {
3398             if (decomp_param[idx].done) {
3399                 decomp_param[idx].done = false;
3400                 qemu_mutex_lock(&decomp_param[idx].mutex);
3401                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3402                 decomp_param[idx].des = host;
3403                 decomp_param[idx].len = len;
3404                 qemu_cond_signal(&decomp_param[idx].cond);
3405                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3406                 break;
3407             }
3408         }
3409         if (idx < thread_count) {
3410             break;
3411         } else {
3412             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3413         }
3414     }
3415     qemu_mutex_unlock(&decomp_done_lock);
3416 }
3417
3418 /**
3419  * ram_load_setup: Setup RAM for migration incoming side
3420  *
3421  * Returns zero to indicate success and negative for error
3422  *
3423  * @f: QEMUFile where to receive the data
3424  * @opaque: RAMState pointer
3425  */
3426 static int ram_load_setup(QEMUFile *f, void *opaque)
3427 {
3428     if (compress_threads_load_setup(f)) {
3429         return -1;
3430     }
3431
3432     xbzrle_load_setup();
3433     ramblock_recv_map_init();
3434     return 0;
3435 }
3436
3437 static int ram_load_cleanup(void *opaque)
3438 {
3439     RAMBlock *rb;
3440     xbzrle_load_cleanup();
3441     compress_threads_load_cleanup();
3442
3443     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3444         g_free(rb->receivedmap);
3445         rb->receivedmap = NULL;
3446     }
3447     return 0;
3448 }
3449
3450 /**
3451  * ram_postcopy_incoming_init: allocate postcopy data structures
3452  *
3453  * Returns 0 for success and negative if there was one error
3454  *
3455  * @mis: current migration incoming state
3456  *
3457  * Allocate data structures etc needed by incoming migration with
3458  * postcopy-ram. postcopy-ram's similarly names
3459  * postcopy_ram_incoming_init does the work.
3460  */
3461 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3462 {
3463     unsigned long ram_pages = last_ram_page();
3464
3465     return postcopy_ram_incoming_init(mis, ram_pages);
3466 }
3467
3468 /**
3469  * ram_load_postcopy: load a page in postcopy case
3470  *
3471  * Returns 0 for success or -errno in case of error
3472  *
3473  * Called in postcopy mode by ram_load().
3474  * rcu_read_lock is taken prior to this being called.
3475  *
3476  * @f: QEMUFile where to send the data
3477  */
3478 static int ram_load_postcopy(QEMUFile *f)
3479 {
3480     int flags = 0, ret = 0;
3481     bool place_needed = false;
3482     bool matching_page_sizes = false;
3483     MigrationIncomingState *mis = migration_incoming_get_current();
3484     /* Temporary page that is later 'placed' */
3485     void *postcopy_host_page = postcopy_get_tmp_page(mis);
3486     void *last_host = NULL;
3487     bool all_zero = false;
3488
3489     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3490         ram_addr_t addr;
3491         void *host = NULL;
3492         void *page_buffer = NULL;
3493         void *place_source = NULL;
3494         RAMBlock *block = NULL;
3495         uint8_t ch;
3496
3497         addr = qemu_get_be64(f);
3498
3499         /*
3500          * If qemu file error, we should stop here, and then "addr"
3501          * may be invalid
3502          */
3503         ret = qemu_file_get_error(f);
3504         if (ret) {
3505             break;
3506         }
3507
3508         flags = addr & ~TARGET_PAGE_MASK;
3509         addr &= TARGET_PAGE_MASK;
3510
3511         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3512         place_needed = false;
3513         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3514             block = ram_block_from_stream(f, flags);
3515
3516             host = host_from_ram_block_offset(block, addr);
3517             if (!host) {
3518                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3519                 ret = -EINVAL;
3520                 break;
3521             }
3522             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3523             /*
3524              * Postcopy requires that we place whole host pages atomically;
3525              * these may be huge pages for RAMBlocks that are backed by
3526              * hugetlbfs.
3527              * To make it atomic, the data is read into a temporary page
3528              * that's moved into place later.
3529              * The migration protocol uses,  possibly smaller, target-pages
3530              * however the source ensures it always sends all the components
3531              * of a host page in order.
3532              */
3533             page_buffer = postcopy_host_page +
3534                           ((uintptr_t)host & (block->page_size - 1));
3535             /* If all TP are zero then we can optimise the place */
3536             if (!((uintptr_t)host & (block->page_size - 1))) {
3537                 all_zero = true;
3538             } else {
3539                 /* not the 1st TP within the HP */
3540                 if (host != (last_host + TARGET_PAGE_SIZE)) {
3541                     error_report("Non-sequential target page %p/%p",
3542                                   host, last_host);
3543                     ret = -EINVAL;
3544                     break;
3545                 }
3546             }
3547
3548
3549             /*
3550              * If it's the last part of a host page then we place the host
3551              * page
3552              */
3553             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3554                                      (block->page_size - 1)) == 0;
3555             place_source = postcopy_host_page;
3556         }
3557         last_host = host;
3558
3559         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3560         case RAM_SAVE_FLAG_ZERO:
3561             ch = qemu_get_byte(f);
3562             memset(page_buffer, ch, TARGET_PAGE_SIZE);
3563             if (ch) {
3564                 all_zero = false;
3565             }
3566             break;
3567
3568         case RAM_SAVE_FLAG_PAGE:
3569             all_zero = false;
3570             if (!place_needed || !matching_page_sizes) {
3571                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3572             } else {
3573                 /* Avoids the qemu_file copy during postcopy, which is
3574                  * going to do a copy later; can only do it when we
3575                  * do this read in one go (matching page sizes)
3576                  */
3577                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3578                                          TARGET_PAGE_SIZE);
3579             }
3580             break;
3581         case RAM_SAVE_FLAG_EOS:
3582             /* normal exit */
3583             multifd_recv_sync_main();
3584             break;
3585         default:
3586             error_report("Unknown combination of migration flags: %#x"
3587                          " (postcopy mode)", flags);
3588             ret = -EINVAL;
3589             break;
3590         }
3591
3592         /* Detect for any possible file errors */
3593         if (!ret && qemu_file_get_error(f)) {
3594             ret = qemu_file_get_error(f);
3595         }
3596
3597         if (!ret && place_needed) {
3598             /* This gets called at the last target page in the host page */
3599             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3600
3601             if (all_zero) {
3602                 ret = postcopy_place_page_zero(mis, place_dest,
3603                                                block);
3604             } else {
3605                 ret = postcopy_place_page(mis, place_dest,
3606                                           place_source, block);
3607             }
3608         }
3609     }
3610
3611     return ret;
3612 }
3613
3614 static bool postcopy_is_advised(void)
3615 {
3616     PostcopyState ps = postcopy_state_get();
3617     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3618 }
3619
3620 static bool postcopy_is_running(void)
3621 {
3622     PostcopyState ps = postcopy_state_get();
3623     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3624 }
3625
3626 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3627 {
3628     int flags = 0, ret = 0, invalid_flags = 0;
3629     static uint64_t seq_iter;
3630     int len = 0;
3631     /*
3632      * If system is running in postcopy mode, page inserts to host memory must
3633      * be atomic
3634      */
3635     bool postcopy_running = postcopy_is_running();
3636     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3637     bool postcopy_advised = postcopy_is_advised();
3638
3639     seq_iter++;
3640
3641     if (version_id != 4) {
3642         ret = -EINVAL;
3643     }
3644
3645     if (!migrate_use_compression()) {
3646         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3647     }
3648     /* This RCU critical section can be very long running.
3649      * When RCU reclaims in the code start to become numerous,
3650      * it will be necessary to reduce the granularity of this
3651      * critical section.
3652      */
3653     rcu_read_lock();
3654
3655     if (postcopy_running) {
3656         ret = ram_load_postcopy(f);
3657     }
3658
3659     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3660         ram_addr_t addr, total_ram_bytes;
3661         void *host = NULL;
3662         uint8_t ch;
3663
3664         addr = qemu_get_be64(f);
3665         flags = addr & ~TARGET_PAGE_MASK;
3666         addr &= TARGET_PAGE_MASK;
3667
3668         if (flags & invalid_flags) {
3669             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3670                 error_report("Received an unexpected compressed page");
3671             }
3672
3673             ret = -EINVAL;
3674             break;
3675         }
3676
3677         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3678                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3679             RAMBlock *block = ram_block_from_stream(f, flags);
3680
3681             host = host_from_ram_block_offset(block, addr);
3682             if (!host) {
3683                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3684                 ret = -EINVAL;
3685                 break;
3686             }
3687             ramblock_recv_bitmap_set(block, host);
3688             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3689         }
3690
3691         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3692         case RAM_SAVE_FLAG_MEM_SIZE:
3693             /* Synchronize RAM block list */
3694             total_ram_bytes = addr;
3695             while (!ret && total_ram_bytes) {
3696                 RAMBlock *block;
3697                 char id[256];
3698                 ram_addr_t length;
3699
3700                 len = qemu_get_byte(f);
3701                 qemu_get_buffer(f, (uint8_t *)id, len);
3702                 id[len] = 0;
3703                 length = qemu_get_be64(f);
3704
3705                 block = qemu_ram_block_by_name(id);
3706                 if (block && !qemu_ram_is_migratable(block)) {
3707                     error_report("block %s should not be migrated !", id);
3708                     ret = -EINVAL;
3709                 } else if (block) {
3710                     if (length != block->used_length) {
3711                         Error *local_err = NULL;
3712
3713                         ret = qemu_ram_resize(block, length,
3714                                               &local_err);
3715                         if (local_err) {
3716                             error_report_err(local_err);
3717                         }
3718                     }
3719                     /* For postcopy we need to check hugepage sizes match */
3720                     if (postcopy_advised &&
3721                         block->page_size != qemu_host_page_size) {
3722                         uint64_t remote_page_size = qemu_get_be64(f);
3723                         if (remote_page_size != block->page_size) {
3724                             error_report("Mismatched RAM page size %s "
3725                                          "(local) %zd != %" PRId64,
3726                                          id, block->page_size,
3727                                          remote_page_size);
3728                             ret = -EINVAL;
3729                         }
3730                     }
3731                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3732                                           block->idstr);
3733                 } else {
3734                     error_report("Unknown ramblock \"%s\", cannot "
3735                                  "accept migration", id);
3736                     ret = -EINVAL;
3737                 }
3738
3739                 total_ram_bytes -= length;
3740             }
3741             break;
3742
3743         case RAM_SAVE_FLAG_ZERO:
3744             ch = qemu_get_byte(f);
3745             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3746             break;
3747
3748         case RAM_SAVE_FLAG_PAGE:
3749             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3750             break;
3751
3752         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3753             len = qemu_get_be32(f);
3754             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3755                 error_report("Invalid compressed data length: %d", len);
3756                 ret = -EINVAL;
3757                 break;
3758             }
3759             decompress_data_with_multi_threads(f, host, len);
3760             break;
3761
3762         case RAM_SAVE_FLAG_XBZRLE:
3763             if (load_xbzrle(f, addr, host) < 0) {
3764                 error_report("Failed to decompress XBZRLE page at "
3765                              RAM_ADDR_FMT, addr);
3766                 ret = -EINVAL;
3767                 break;
3768             }
3769             break;
3770         case RAM_SAVE_FLAG_EOS:
3771             /* normal exit */
3772             multifd_recv_sync_main();
3773             break;
3774         default:
3775             if (flags & RAM_SAVE_FLAG_HOOK) {
3776                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3777             } else {
3778                 error_report("Unknown combination of migration flags: %#x",
3779                              flags);
3780                 ret = -EINVAL;
3781             }
3782         }
3783         if (!ret) {
3784             ret = qemu_file_get_error(f);
3785         }
3786     }
3787
3788     ret |= wait_for_decompress_done();
3789     rcu_read_unlock();
3790     trace_ram_load_complete(ret, seq_iter);
3791     return ret;
3792 }
3793
/*
 * .has_postcopy handler for the "ram" section (see savevm_ram_handlers).
 *
 * @opaque is not consulted; the answer is whatever migrate_postcopy_ram()
 * reports.
 */
static bool ram_has_postcopy(void *opaque)
{
    const bool postcopy_ram = migrate_postcopy_ram();

    return postcopy_ram;
}
3798
3799 /* Sync all the dirty bitmap with destination VM.  */
3800 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3801 {
3802     RAMBlock *block;
3803     QEMUFile *file = s->to_dst_file;
3804     int ramblock_count = 0;
3805
3806     trace_ram_dirty_bitmap_sync_start();
3807
3808     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3809         qemu_savevm_send_recv_bitmap(file, block->idstr);
3810         trace_ram_dirty_bitmap_request(block->idstr);
3811         ramblock_count++;
3812     }
3813
3814     trace_ram_dirty_bitmap_sync_wait();
3815
3816     /* Wait until all the ramblocks' dirty bitmap synced */
3817     while (ramblock_count--) {
3818         qemu_sem_wait(&s->rp_state.rp_sem);
3819     }
3820
3821     trace_ram_dirty_bitmap_sync_complete();
3822
3823     return 0;
3824 }
3825
3826 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3827 {
3828     qemu_sem_post(&s->rp_state.rp_sem);
3829 }
3830
3831 /*
3832  * Read the received bitmap, revert it as the initial dirty bitmap.
3833  * This is only used when the postcopy migration is paused but wants
3834  * to resume from a middle point.
3835  */
3836 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3837 {
3838     int ret = -EINVAL;
3839     QEMUFile *file = s->rp_state.from_dst_file;
3840     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3841     uint64_t local_size = nbits / 8;
3842     uint64_t size, end_mark;
3843
3844     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3845
3846     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3847         error_report("%s: incorrect state %s", __func__,
3848                      MigrationStatus_str(s->state));
3849         return -EINVAL;
3850     }
3851
3852     /*
3853      * Note: see comments in ramblock_recv_bitmap_send() on why we
3854      * need the endianess convertion, and the paddings.
3855      */
3856     local_size = ROUND_UP(local_size, 8);
3857
3858     /* Add paddings */
3859     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3860
3861     size = qemu_get_be64(file);
3862
3863     /* The size of the bitmap should match with our ramblock */
3864     if (size != local_size) {
3865         error_report("%s: ramblock '%s' bitmap size mismatch "
3866                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3867                      block->idstr, size, local_size);
3868         ret = -EINVAL;
3869         goto out;
3870     }
3871
3872     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3873     end_mark = qemu_get_be64(file);
3874
3875     ret = qemu_file_get_error(file);
3876     if (ret || size != local_size) {
3877         error_report("%s: read bitmap failed for ramblock '%s': %d"
3878                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3879                      __func__, block->idstr, ret, local_size, size);
3880         ret = -EIO;
3881         goto out;
3882     }
3883
3884     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3885         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3886                      __func__, block->idstr, end_mark);
3887         ret = -EINVAL;
3888         goto out;
3889     }
3890
3891     /*
3892      * Endianess convertion. We are during postcopy (though paused).
3893      * The dirty bitmap won't change. We can directly modify it.
3894      */
3895     bitmap_from_le(block->bmap, le_bitmap, nbits);
3896
3897     /*
3898      * What we received is "received bitmap". Revert it as the initial
3899      * dirty bitmap for this ramblock.
3900      */
3901     bitmap_complement(block->bmap, block->bmap, nbits);
3902
3903     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3904
3905     /*
3906      * We succeeded to sync bitmap for current ramblock. If this is
3907      * the last one to sync, we need to notify the main send thread.
3908      */
3909     ram_dirty_bitmap_reload_notify(s);
3910
3911     ret = 0;
3912 out:
3913     g_free(le_bitmap);
3914     return ret;
3915 }
3916
3917 static int ram_resume_prepare(MigrationState *s, void *opaque)
3918 {
3919     RAMState *rs = *(RAMState **)opaque;
3920     int ret;
3921
3922     ret = ram_dirty_bitmap_sync_all(s, rs);
3923     if (ret) {
3924         return ret;
3925     }
3926
3927     ram_state_resume_prepare(rs, s->to_dst_file);
3928
3929     return 0;
3930 }
3931
3932 static SaveVMHandlers savevm_ram_handlers = {
3933     .save_setup = ram_save_setup,
3934     .save_live_iterate = ram_save_iterate,
3935     .save_live_complete_postcopy = ram_save_complete,
3936     .save_live_complete_precopy = ram_save_complete,
3937     .has_postcopy = ram_has_postcopy,
3938     .save_live_pending = ram_save_pending,
3939     .load_state = ram_load,
3940     .save_cleanup = ram_save_cleanup,
3941     .load_setup = ram_load_setup,
3942     .load_cleanup = ram_load_cleanup,
3943     .resume_prepare = ram_resume_prepare,
3944 };
3945
3946 void ram_mig_init(void)
3947 {
3948     qemu_mutex_init(&XBZRLE.lock);
3949     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
3950 }