migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <[email protected]>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "socket.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/sysemu.h"
  56 #include "qemu/uuid.h"
  57 #include "savevm.h"
  58
  59 /***********************************************************/
  60 /* ram save/restore */
  61
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  67
/*
 * Flag bits used by the RAM save/restore code.  Each flag is a distinct
 * bit so several can be combined in one value.
 * NOTE(review): where the flags are stored on the wire is not visible in
 * this part of the file -- confirm against the save/load routines.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  77
  78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  79 {
  80     return buffer_is_zero(p, size);
  81 }
  82
/*
 * Counters of type XBZRLECacheStats.  The symbol is not static;
 * NOTE(review): presumably it is read from other files -- confirm.
 */
XBZRLECacheStats xbzrle_counters;

/*
 * File-local XBZRLE state: the page cache, the mutex guarding it and the
 * scratch buffers used while encoding/decoding.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken by XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 100
 101 static void XBZRLE_cache_lock(void)
 102 {
 103     if (migrate_use_xbzrle())
 104         qemu_mutex_lock(&XBZRLE.lock);
 105 }
 106
 107 static void XBZRLE_cache_unlock(void)
 108 {
 109     if (migrate_use_xbzrle())
 110         qemu_mutex_unlock(&XBZRLE.lock);
 111 }
 112
 113 /**
 114  * xbzrle_cache_resize: resize the xbzrle cache
 115  *
 116  * This function is called from qmp_migrate_set_cache_size in main
 117  * thread, possibly while a migration is in progress.  A running
 118  * migration may be using the cache and might finish during this call,
 119  * hence changes to the cache are protected by XBZRLE.lock().
 120  *
 121  * Returns 0 for success or -1 for error
 122  *
 123  * @new_size: new cache size
 124  * @errp: set *errp if the check failed, with reason
 125  */
 126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 127 {
 128     PageCache *new_cache;
 129     int64_t ret = 0;
 130
 131     /* Check for truncation */
 132     if (new_size != (size_t)new_size) {
 133         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 134                    "exceeding address space");
 135         return -1;
 136     }
 137
 138     if (new_size == migrate_xbzrle_cache_size()) {
 139         /* nothing to do */
 140         return 0;
 141     }
 142
 143     XBZRLE_cache_lock();
 144
 145     if (XBZRLE.cache != NULL) {
 146         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 147         if (!new_cache) {
 148             ret = -1;
 149             goto out;
 150         }
 151
 152         cache_fini(XBZRLE.cache);
 153         XBZRLE.cache = new_cache;
 154     }
 155 out:
 156     XBZRLE_cache_unlock();
 157     return ret;
 158 }
 159
/*
 * Iterate over the RAMBlocks for which qemu_ram_is_migratable() is true.
 *
 * The trailing "if (...) {} else" makes the statement written after the
 * macro the else-branch, so it only runs for migratable blocks.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * Drop RAMBLOCK_FOREACH for the rest of this file; presumably so that
 * only the migratable-filtered iterator above can be used -- confirm.
 */
#undef RAMBLOCK_FOREACH
 166
 167 static void ramblock_recv_map_init(void)
 168 {
 169     RAMBlock *rb;
 170
 171     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 172         assert(!rb->receivedmap);
 173         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 174     }
 175 }
 176
 177 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 178 {
 179     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 180                     rb->receivedmap);
 181 }
 182
 183 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 184 {
 185     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 186 }
 187
 188 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 189 {
 190     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 191 }
 192
 193 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 194                                     size_t nr)
 195 {
 196     bitmap_set_atomic(rb->receivedmap,
 197                       ramblock_recv_bitmap_offset(host_addr, rb),
 198                       nr);
 199 }
 200
 201 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 202
 203 /*
 204  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 205  *
 206  * Returns >0 if success with sent bytes, or <0 if error.
 207  */
 208 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 209                                   const char *block_name)
 210 {
 211     RAMBlock *block = qemu_ram_block_by_name(block_name);
 212     unsigned long *le_bitmap, nbits;
 213     uint64_t size;
 214
 215     if (!block) {
 216         error_report("%s: invalid block name: %s", __func__, block_name);
 217         return -1;
 218     }
 219
 220     nbits = block->used_length >> TARGET_PAGE_BITS;
 221
 222     /*
 223      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 224      * machines we may need 4 more bytes for padding (see below
 225      * comment). So extend it a bit before hand.
 226      */
 227     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 228
 229     /*
 230      * Always use little endian when sending the bitmap. This is
 231      * required that when source and destination VMs are not using the
 232      * same endianess. (Note: big endian won't work.)
 233      */
 234     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 235
 236     /* Size of the bitmap, in bytes */
 237     size = nbits / 8;
 238
 239     /*
 240      * size is always aligned to 8 bytes for 64bit machines, but it
 241      * may not be true for 32bit machines. We need this padding to
 242      * make sure the migration can survive even between 32bit and
 243      * 64bit machines.
 244      */
 245     size = ROUND_UP(size, 8);
 246
 247     qemu_put_be64(file, size);
 248     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 249     /*
 250      * Mark as an end, in case the middle part is screwed up due to
 251      * some "misterious" reason.
 252      */
 253     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 254     qemu_fflush(file);
 255
 256     g_free(le_bitmap);
 257
 258     if (qemu_file_get_error(file)) {
 259         return qemu_file_get_error(file);
 260     }
 261
 262     return size + sizeof(size);
 263 }
 264
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* start and length of the requested range;
     * NOTE(review): presumably in bytes relative to @rb -- confirm */
    hwaddr    offset;
    hwaddr    len;

    /* link in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 276
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap; ram_bytes_remaining() multiplies
     * this by TARGET_PAGE_SIZE */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The single RAMState of this file; NULL is handled by
 * ram_bytes_remaining() as "nothing remaining" */
static RAMState *ram_state;
 319
 320 uint64_t ram_bytes_remaining(void)
 321 {
 322     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 323                        0;
 324 }
 325
/*
 * Counters of type MigrationStats.  NOTE(review): not updated in the part
 * of the file visible here; presumably maintained by the save path.
 */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 338
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, when a page is finished */
    bool done;
    /* set under @mutex to ask the worker to exit */
    bool quit;
    /* buffer-only QEMUFile the worker writes into; cleanup also uses a
     * non-NULL value as the "fully initialised" indicator */
    QEMUFile *file;
    /* protects @quit, @block and @offset; @cond wakes the worker */
    QemuMutex mutex;
    QemuCond cond;
    /* pending job: page at @offset of @block; @block == NULL means idle */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 353
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the worker itself is outside the visible part of the
 * file; field roles below are inferred from the CompressParam
 * counterpart and should be confirmed.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 365
/* Arrays with one entry per compression thread; allocated in
 * compress_threads_save_setup(), freed in compress_threads_save_cleanup() */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression counterparts of the variables above;
 * NOTE(review): their users are outside the visible part of the file */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined later in the file; called by do_data_compress() */
static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                ram_addr_t offset, uint8_t *source_buf);
 385
/*
 * Thread body of one compression worker.
 *
 * @opaque: the CompressParam of this worker
 *
 * Loops until param->quit is set.  Whenever param->block is non-NULL the
 * worker takes (block, offset) as its job, clears param->block and
 * compresses the page with param->mutex dropped.  It then sets
 * param->done and signals comp_done_cond under comp_done_lock.  With no
 * job pending it sleeps on param->cond.
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Copy the job out and mark the slot free before unlocking */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* The return value is not checked here */
            do_compress_ram_page(param->file, &param->stream, block, offset,
                                 param->originbuf);

            /* Completion is published under comp_done_lock, not
             * param->mutex */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take param->mutex before re-testing quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 417
 418 static inline void terminate_compression_threads(void)
 419 {
 420     int idx, thread_count;
 421
 422     thread_count = migrate_compress_threads();
 423
 424     for (idx = 0; idx < thread_count; idx++) {
 425         qemu_mutex_lock(&comp_param[idx].mutex);
 426         comp_param[idx].quit = true;
 427         qemu_cond_signal(&comp_param[idx].cond);
 428         qemu_mutex_unlock(&comp_param[idx].mutex);
 429     }
 430 }
 431
 432 static void compress_threads_save_cleanup(void)
 433 {
 434     int i, thread_count;
 435
 436     if (!migrate_use_compression()) {
 437         return;
 438     }
 439     terminate_compression_threads();
 440     thread_count = migrate_compress_threads();
 441     for (i = 0; i < thread_count; i++) {
 442         /*
 443          * we use it as a indicator which shows if the thread is
 444          * properly init'd or not
 445          */
 446         if (!comp_param[i].file) {
 447             break;
 448         }
 449         qemu_thread_join(compress_threads + i);
 450         qemu_mutex_destroy(&comp_param[i].mutex);
 451         qemu_cond_destroy(&comp_param[i].cond);
 452         deflateEnd(&comp_param[i].stream);
 453         g_free(comp_param[i].originbuf);
 454         qemu_fclose(comp_param[i].file);
 455         comp_param[i].file = NULL;
 456     }
 457     qemu_mutex_destroy(&comp_done_lock);
 458     qemu_cond_destroy(&comp_done_cond);
 459     g_free(compress_threads);
 460     g_free(comp_param);
 461     compress_threads = NULL;
 462     comp_param = NULL;
 463 }
 464
 465 static int compress_threads_save_setup(void)
 466 {
 467     int i, thread_count;
 468
 469     if (!migrate_use_compression()) {
 470         return 0;
 471     }
 472     thread_count = migrate_compress_threads();
 473     compress_threads = g_new0(QemuThread, thread_count);
 474     comp_param = g_new0(CompressParam, thread_count);
 475     qemu_cond_init(&comp_done_cond);
 476     qemu_mutex_init(&comp_done_lock);
 477     for (i = 0; i < thread_count; i++) {
 478         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 479         if (!comp_param[i].originbuf) {
 480             goto exit;
 481         }
 482
 483         if (deflateInit(&comp_param[i].stream,
 484                         migrate_compress_level()) != Z_OK) {
 485             g_free(comp_param[i].originbuf);
 486             goto exit;
 487         }
 488
 489         /* comp_param[i].file is just used as a dummy buffer to save data,
 490          * set its ops to empty.
 491          */
 492         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 493         comp_param[i].done = true;
 494         comp_param[i].quit = false;
 495         qemu_mutex_init(&comp_param[i].mutex);
 496         qemu_cond_init(&comp_param[i].cond);
 497         qemu_thread_create(compress_threads + i, "compress",
 498                            do_data_compress, comp_param + i,
 499                            QEMU_THREAD_JOINABLE);
 500     }
 501     return 0;
 502
 503 exit:
 504     compress_threads_save_cleanup();
 505     return -1;
 506 }
 507
/* Multiple fd's */

/* Magic and version carried by every multifd message defined below */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/*
 * First message on each multifd channel; written by
 * multifd_send_initial_packet() and checked by
 * multifd_recv_initial_packet().  magic and version travel big-endian.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    uint8_t id;             /* channel number of the sender */
} __attribute__((packed)) MultiFDInit_t;
 519
/*
 * Header describing one batch of pages; filled by
 * multifd_send_fill_packet(), which stores every integer big-endian.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    /* maximum number of pages per packet (migrate_multifd_page_count()) */
    uint32_t size;
    /* number of valid entries in offset[] */
    uint32_t used;
    uint64_t packet_num;
    /* idstr of the RAMBlock the offsets refer to */
    char ramblock[256];
    /* one 64-bit offset per page */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 530
/*
 * A batch of pages; created by multifd_pages_init() and released by
 * multifd_pages_clear().
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAMBlock all pages of this batch belong to */
    RAMBlock *block;
} MultiFDPages_t;
 544
/* Per-channel state of one multifd send thread */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do; a counter, decremented by the thread after
     * each packet */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
}  MultiFDSendParams;
 581
/* Per-channel state of one multifd receive thread */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    bool pending_job;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets through this channel; NOTE(review): the original comment
     * said "sent", on the receive side this is presumably "received" */
    uint64_t num_packets;
    /* pages through this channel (same remark as above) */
    uint64_t num_pages;
} MultiFDRecvParams;
 618
 619 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 620 {
 621     MultiFDInit_t msg;
 622     int ret;
 623
 624     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 625     msg.version = cpu_to_be32(MULTIFD_VERSION);
 626     msg.id = p->id;
 627     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 628
 629     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 630     if (ret != 0) {
 631         return -1;
 632     }
 633     return 0;
 634 }
 635
 636 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 637 {
 638     MultiFDInit_t msg;
 639     int ret;
 640
 641     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 642     if (ret != 0) {
 643         return -1;
 644     }
 645
 646     be32_to_cpus(&msg.magic);
 647     be32_to_cpus(&msg.version);
 648
 649     if (msg.magic != MULTIFD_MAGIC) {
 650         error_setg(errp, "multifd: received packet magic %x "
 651                    "expected %x", msg.magic, MULTIFD_MAGIC);
 652         return -1;
 653     }
 654
 655     if (msg.version != MULTIFD_VERSION) {
 656         error_setg(errp, "multifd: received packet version %d "
 657                    "expected %d", msg.version, MULTIFD_VERSION);
 658         return -1;
 659     }
 660
 661     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 662         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 663         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 664
 665         error_setg(errp, "multifd: received uuid '%s' and expected "
 666                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 667         g_free(uuid);
 668         g_free(msg_uuid);
 669         return -1;
 670     }
 671
 672     if (msg.id > migrate_multifd_channels()) {
 673         error_setg(errp, "multifd: received channel version %d "
 674                    "expected %d", msg.version, MULTIFD_VERSION);
 675         return -1;
 676     }
 677
 678     return msg.id;
 679 }
 680
 681 static MultiFDPages_t *multifd_pages_init(size_t size)
 682 {
 683     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 684
 685     pages->allocated = size;
 686     pages->iov = g_new0(struct iovec, size);
 687     pages->offset = g_new0(ram_addr_t, size);
 688
 689     return pages;
 690 }
 691
 692 static void multifd_pages_clear(MultiFDPages_t *pages)
 693 {
 694     pages->used = 0;
 695     pages->allocated = 0;
 696     pages->packet_num = 0;
 697     pages->block = NULL;
 698     g_free(pages->iov);
 699     pages->iov = NULL;
 700     g_free(pages->offset);
 701     pages->offset = NULL;
 702     g_free(pages);
 703 }
 704
 705 static void multifd_send_fill_packet(MultiFDSendParams *p)
 706 {
 707     MultiFDPacket_t *packet = p->packet;
 708     int i;
 709
 710     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 711     packet->version = cpu_to_be32(MULTIFD_VERSION);
 712     packet->flags = cpu_to_be32(p->flags);
 713     packet->size = cpu_to_be32(migrate_multifd_page_count());
 714     packet->used = cpu_to_be32(p->pages->used);
 715     packet->packet_num = cpu_to_be64(p->packet_num);
 716
 717     if (p->pages->block) {
 718         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 719     }
 720
 721     for (i = 0; i < p->pages->used; i++) {
 722         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 723     }
 724 }
 725
/*
 * Validate a received MultiFDPacket_t and translate it into p->pages.
 *
 * NOTE: the function is currently stubbed out -- the unconditional
 * "return 0" below makes everything after it unreachable, so at present
 * it always succeeds without looking at the packet.
 *
 * The unreachable code checks magic, version, size and used, looks up
 * the RAMBlock by name and fills one iovec per page.
 *
 * Returns 0 on success; the disabled code returns -1 with @errp set on
 * any validation failure.
 */
static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
{
    MultiFDPacket_t *packet = p->packet;
    RAMBlock *block;
    int i;

    /* ToDo: We can't use it until we haven't received a message */
    return 0;

    be32_to_cpus(&packet->magic);
    if (packet->magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet "
                   "magic %x and expected magic %x",
                   packet->magic, MULTIFD_MAGIC);
        return -1;
    }

    be32_to_cpus(&packet->version);
    if (packet->version != MULTIFD_VERSION) {
        error_setg(errp, "multifd: received packet "
                   "version %d and expected version %d",
                   packet->version, MULTIFD_VERSION);
        return -1;
    }

    p->flags = be32_to_cpu(packet->flags);

    be32_to_cpus(&packet->size);
    if (packet->size > migrate_multifd_page_count()) {
        error_setg(errp, "multifd: received packet "
                   "with size %d and expected maximum size %d",
                   packet->size, migrate_multifd_page_count()) ;
        return -1;
    }

    p->pages->used = be32_to_cpu(packet->used);
    if (p->pages->used > packet->size) {
        error_setg(errp, "multifd: received packet "
                   "with size %d and expected maximum size %d",
                   p->pages->used, packet->size) ;
        return -1;
    }

    p->packet_num = be64_to_cpu(packet->packet_num);

    /* block is only assigned when there are pages; the loop below does
     * not run otherwise */
    if (p->pages->used) {
        /* make sure that ramblock is 0 terminated */
        packet->ramblock[255] = 0;
        block = qemu_ram_block_by_name(packet->ramblock);
        if (!block) {
            error_setg(errp, "multifd: unknown ram block %s",
                       packet->ramblock);
            return -1;
        }
    }

    for (i = 0; i < p->pages->used; i++) {
        ram_addr_t offset = be64_to_cpu(packet->offset[i]);

        /* NOTE(review): the check uses used_length while the message
         * prints max_length -- confirm which one is intended */
        if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
            error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
                       " (max " RAM_ADDR_FMT ")",
                       offset, block->max_length);
            return -1;
        }
        p->pages->iov[i].iov_base = block->host + offset;
        p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
    }

    return 0;
}
 797
/*
 * Global state of the multifd send side; allocated by
 * multifd_save_setup() and freed by multifd_save_cleanup().
 */
struct {
    /* one entry per channel (migrate_multifd_channels() entries) */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send */
    MultiFDPages_t *pages;
} *multifd_send_state;
 805
 806 static void multifd_send_terminate_threads(Error *err)
 807 {
 808     int i;
 809
 810     if (err) {
 811         MigrationState *s = migrate_get_current();
 812         migrate_set_error(s, err);
 813         if (s->state == MIGRATION_STATUS_SETUP ||
 814             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 815             s->state == MIGRATION_STATUS_DEVICE ||
 816             s->state == MIGRATION_STATUS_ACTIVE) {
 817             migrate_set_state(&s->state, s->state,
 818                               MIGRATION_STATUS_FAILED);
 819         }
 820     }
 821
 822     for (i = 0; i < migrate_multifd_channels(); i++) {
 823         MultiFDSendParams *p = &multifd_send_state->params[i];
 824
 825         qemu_mutex_lock(&p->mutex);
 826         p->quit = true;
 827         qemu_sem_post(&p->sem);
 828         qemu_mutex_unlock(&p->mutex);
 829     }
 830 }
 831
 832 int multifd_save_cleanup(Error **errp)
 833 {
 834     int i;
 835     int ret = 0;
 836
 837     if (!migrate_use_multifd()) {
 838         return 0;
 839     }
 840     multifd_send_terminate_threads(NULL);
 841     for (i = 0; i < migrate_multifd_channels(); i++) {
 842         MultiFDSendParams *p = &multifd_send_state->params[i];
 843
 844         if (p->running) {
 845             qemu_thread_join(&p->thread);
 846         }
 847         socket_send_channel_destroy(p->c);
 848         p->c = NULL;
 849         qemu_mutex_destroy(&p->mutex);
 850         qemu_sem_destroy(&p->sem);
 851         g_free(p->name);
 852         p->name = NULL;
 853         multifd_pages_clear(p->pages);
 854         p->pages = NULL;
 855         p->packet_len = 0;
 856         g_free(p->packet);
 857         p->packet = NULL;
 858     }
 859     g_free(multifd_send_state->params);
 860     multifd_send_state->params = NULL;
 861     multifd_pages_clear(multifd_send_state->pages);
 862     multifd_send_state->pages = NULL;
 863     g_free(multifd_send_state);
 864     multifd_send_state = NULL;
 865     return ret;
 866 }
 867
/*
 * Thread body of one multifd send channel.
 *
 * @opaque: the MultiFDSendParams of this channel
 *
 * Sends the initial handshake packet, then waits on p->sem.  On each
 * wake-up, under p->mutex, it either handles one pending job (fills the
 * packet header and updates the counters; the actual send is still a
 * ToDo) or leaves the loop when p->quit is set.  Waking up with neither
 * is treated as an error.
 *
 * On error all send threads are asked to terminate.  p->running is
 * cleared before returning.
 *
 * Always returns NULL.
 */
static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;
    Error *local_err = NULL;

    trace_multifd_send_thread_start(p->id);

    if (multifd_send_initial_packet(p, &local_err) < 0) {
        goto out;
    }
    /* initial packet */
    p->num_packets = 1;

    while (true) {
        qemu_sem_wait(&p->sem);
        qemu_mutex_lock(&p->mutex);

        if (p->pending_job) {
            /* snapshot what the trace needs before the fields are reset */
            uint32_t used = p->pages->used;
            uint64_t packet_num = p->packet_num;
            uint32_t flags = p->flags;

            multifd_send_fill_packet(p);
            p->flags = 0;
            p->num_packets++;
            p->num_pages += used;
            p->pages->used = 0;
            qemu_mutex_unlock(&p->mutex);

            trace_multifd_send(p->id, packet_num, used, flags);

            /* ToDo: send packet here */

            /* the job counts as pending until this point */
            qemu_mutex_lock(&p->mutex);
            p->pending_job--;
            qemu_mutex_unlock(&p->mutex);
            continue;
        } else if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }
        qemu_mutex_unlock(&p->mutex);
        /* this is impossible */
        error_setg(&local_err, "multifd_send_thread: Unknown command");
        break;
    }

out:
    if (local_err) {
        multifd_send_terminate_threads(local_err);
    }

    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}
 928
 929 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
 930 {
 931     MultiFDSendParams *p = opaque;
 932     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
 933     Error *local_err = NULL;
 934
 935     if (qio_task_propagate_error(task, &local_err)) {
 936         if (multifd_save_cleanup(&local_err) != 0) {
 937             migrate_set_error(migrate_get_current(), local_err);
 938         }
 939     } else {
 940         p->c = QIO_CHANNEL(sioc);
 941         qio_channel_set_delay(p->c, false);
 942         p->running = true;
 943         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
 944                            QEMU_THREAD_JOINABLE);
 945
 946         atomic_inc(&multifd_send_state->count);
 947     }
 948 }
 949
 950 int multifd_save_setup(void)
 951 {
 952     int thread_count;
 953     uint32_t page_count = migrate_multifd_page_count();
 954     uint8_t i;
 955
 956     if (!migrate_use_multifd()) {
 957         return 0;
 958     }
 959     thread_count = migrate_multifd_channels();
 960     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 961     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 962     atomic_set(&multifd_send_state->count, 0);
 963     multifd_send_state->pages = multifd_pages_init(page_count);
 964
 965     for (i = 0; i < thread_count; i++) {
 966         MultiFDSendParams *p = &multifd_send_state->params[i];
 967
 968         qemu_mutex_init(&p->mutex);
 969         qemu_sem_init(&p->sem, 0);
 970         p->quit = false;
 971         p->pending_job = 0;
 972         p->id = i;
 973         p->pages = multifd_pages_init(page_count);
 974         p->packet_len = sizeof(MultiFDPacket_t)
 975                       + sizeof(ram_addr_t) * page_count;
 976         p->packet = g_malloc0(p->packet_len);
 977         p->name = g_strdup_printf("multifdsend_%d", i);
 978         socket_send_channel_create(multifd_new_send_channel_async, p);
 979     }
 980     return 0;
 981 }
 982
/* State shared by all multifd receive channels */
struct {
    /* array of per-channel receive parameters */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
} *multifd_recv_state;
 988
 989 static void multifd_recv_terminate_threads(Error *err)
 990 {
 991     int i;
 992
 993     if (err) {
 994         MigrationState *s = migrate_get_current();
 995         migrate_set_error(s, err);
 996         if (s->state == MIGRATION_STATUS_SETUP ||
 997             s->state == MIGRATION_STATUS_ACTIVE) {
 998             migrate_set_state(&s->state, s->state,
 999                               MIGRATION_STATUS_FAILED);
1000         }
1001     }
1002
1003     for (i = 0; i < migrate_multifd_channels(); i++) {
1004         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1005
1006         qemu_mutex_lock(&p->mutex);
1007         p->quit = true;
1008         qemu_sem_post(&p->sem);
1009         qemu_mutex_unlock(&p->mutex);
1010     }
1011 }
1012
1013 int multifd_load_cleanup(Error **errp)
1014 {
1015     int i;
1016     int ret = 0;
1017
1018     if (!migrate_use_multifd()) {
1019         return 0;
1020     }
1021     multifd_recv_terminate_threads(NULL);
1022     for (i = 0; i < migrate_multifd_channels(); i++) {
1023         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1024
1025         if (p->running) {
1026             qemu_thread_join(&p->thread);
1027         }
1028         object_unref(OBJECT(p->c));
1029         p->c = NULL;
1030         qemu_mutex_destroy(&p->mutex);
1031         qemu_sem_destroy(&p->sem);
1032         g_free(p->name);
1033         p->name = NULL;
1034         multifd_pages_clear(p->pages);
1035         p->pages = NULL;
1036         p->packet_len = 0;
1037         g_free(p->packet);
1038         p->packet = NULL;
1039     }
1040     g_free(multifd_recv_state->params);
1041     multifd_recv_state->params = NULL;
1042     g_free(multifd_recv_state);
1043     multifd_recv_state = NULL;
1044
1045     return ret;
1046 }
1047
1048 static void *multifd_recv_thread(void *opaque)
1049 {
1050     MultiFDRecvParams *p = opaque;
1051     Error *local_err = NULL;
1052     int ret;
1053
1054     trace_multifd_recv_thread_start(p->id);
1055
1056     while (true) {
1057         qemu_sem_wait(&p->sem);
1058         qemu_mutex_lock(&p->mutex);
1059         if (p->pending_job) {
1060             uint32_t used;
1061             uint32_t flags;
1062             qemu_mutex_unlock(&p->mutex);
1063
1064             /* ToDo: recv packet here */
1065
1066             qemu_mutex_lock(&p->mutex);
1067             ret = multifd_recv_unfill_packet(p, &local_err);
1068             if (ret) {
1069                 qemu_mutex_unlock(&p->mutex);
1070                 break;
1071             }
1072
1073             used = p->pages->used;
1074             flags = p->flags;
1075             trace_multifd_recv(p->id, p->packet_num, used, flags);
1076             p->pending_job = false;
1077             p->num_packets++;
1078             p->num_pages += used;
1079             qemu_mutex_unlock(&p->mutex);
1080         } else if (p->quit) {
1081             qemu_mutex_unlock(&p->mutex);
1082             break;
1083         }
1084         qemu_mutex_unlock(&p->mutex);
1085         /* this is impossible */
1086         error_setg(&local_err, "multifd_recv_thread: Unknown command");
1087         break;
1088     }
1089
1090     if (local_err) {
1091         multifd_recv_terminate_threads(local_err);
1092     }
1093     qemu_mutex_lock(&p->mutex);
1094     p->running = false;
1095     qemu_mutex_unlock(&p->mutex);
1096
1097     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1098
1099     return NULL;
1100 }
1101
1102 int multifd_load_setup(void)
1103 {
1104     int thread_count;
1105     uint32_t page_count = migrate_multifd_page_count();
1106     uint8_t i;
1107
1108     if (!migrate_use_multifd()) {
1109         return 0;
1110     }
1111     thread_count = migrate_multifd_channels();
1112     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1113     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1114     atomic_set(&multifd_recv_state->count, 0);
1115
1116     for (i = 0; i < thread_count; i++) {
1117         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1118
1119         qemu_mutex_init(&p->mutex);
1120         qemu_sem_init(&p->sem, 0);
1121         p->quit = false;
1122         p->pending_job = false;
1123         p->id = i;
1124         p->pages = multifd_pages_init(page_count);
1125         p->packet_len = sizeof(MultiFDPacket_t)
1126                       + sizeof(ram_addr_t) * page_count;
1127         p->packet = g_malloc0(p->packet_len);
1128         p->name = g_strdup_printf("multifdrecv_%d", i);
1129     }
1130     return 0;
1131 }
1132
1133 bool multifd_recv_all_channels_created(void)
1134 {
1135     int thread_count = migrate_multifd_channels();
1136
1137     if (!migrate_use_multifd()) {
1138         return true;
1139     }
1140
1141     return thread_count == atomic_read(&multifd_recv_state->count);
1142 }
1143
1144 void multifd_recv_new_channel(QIOChannel *ioc)
1145 {
1146     MultiFDRecvParams *p;
1147     Error *local_err = NULL;
1148     int id;
1149
1150     id = multifd_recv_initial_packet(ioc, &local_err);
1151     if (id < 0) {
1152         multifd_recv_terminate_threads(local_err);
1153         return;
1154     }
1155
1156     p = &multifd_recv_state->params[id];
1157     if (p->c != NULL) {
1158         error_setg(&local_err, "multifd: received id '%d' already setup'",
1159                    id);
1160         multifd_recv_terminate_threads(local_err);
1161         return;
1162     }
1163     p->c = ioc;
1164     object_ref(OBJECT(ioc));
1165     /* initial packet */
1166     p->num_packets = 1;
1167
1168     p->running = true;
1169     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1170                        QEMU_THREAD_JOINABLE);
1171     atomic_inc(&multifd_recv_state->count);
1172     if (multifd_recv_state->count == migrate_multifd_channels()) {
1173         migration_incoming_process();
1174     }
1175 }
1176
1177 /**
1178  * save_page_header: write page header to wire
1179  *
1180  * If this is the 1st block, it also writes the block identification
1181  *
1182  * Returns the number of bytes written
1183  *
1184  * @f: QEMUFile where to send the data
1185  * @block: block that contains the page we want to send
1186  * @offset: offset inside the block for the page
1187  *          in the lower bits, it contains flags
1188  */
1189 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1190                                ram_addr_t offset)
1191 {
1192     size_t size, len;
1193
1194     if (block == rs->last_sent_block) {
1195         offset |= RAM_SAVE_FLAG_CONTINUE;
1196     }
1197     qemu_put_be64(f, offset);
1198     size = 8;
1199
1200     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1201         len = strlen(block->idstr);
1202         qemu_put_byte(f, len);
1203         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1204         size += 1 + len;
1205         rs->last_sent_block = block;
1206     }
1207     return size;
1208 }
1209
1210 /**
1211  * mig_throttle_guest_down: throotle down the guest
1212  *
1213  * Reduce amount of guest cpu execution to hopefully slow down memory
1214  * writes. If guest dirty memory rate is reduced below the rate at
1215  * which we can transfer pages to the destination then we should be
1216  * able to complete migration. Some workloads dirty memory way too
1217  * fast and will not effectively converge, even with auto-converge.
1218  */
1219 static void mig_throttle_guest_down(void)
1220 {
1221     MigrationState *s = migrate_get_current();
1222     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1223     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1224
1225     /* We have not started throttling yet. Let's start it. */
1226     if (!cpu_throttle_active()) {
1227         cpu_throttle_set(pct_initial);
1228     } else {
1229         /* Throttling already on, just increase the rate */
1230         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
1231     }
1232 }
1233
1234 /**
1235  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1236  *
1237  * @rs: current RAM state
1238  * @current_addr: address for the zero page
1239  *
1240  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1241  * The important thing is that a stale (not-yet-0'd) page be replaced
1242  * by the new data.
1243  * As a bonus, if the page wasn't in the cache it gets added so that
1244  * when a small write is made into the 0'd page it gets XBZRLE sent.
1245  */
1246 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1247 {
1248     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1249         return;
1250     }
1251
1252     /* We don't care if this fails to allocate a new cache page
1253      * as long as it updated an old one */
1254     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1255                  ram_counters.dirty_sync_count);
1256 }
1257
1258 #define ENCODING_FLAG_XBZRLE 0x1
1259
1260 /**
1261  * save_xbzrle_page: compress and send current page
1262  *
1263  * Returns: 1 means that we wrote the page
1264  *          0 means that page is identical to the one already sent
1265  *          -1 means that xbzrle would be longer than normal
1266  *
1267  * @rs: current RAM state
1268  * @current_data: pointer to the address of the page contents
1269  * @current_addr: addr of the page
1270  * @block: block that contains the page we want to send
1271  * @offset: offset inside the block for the page
1272  * @last_stage: if we are at the completion stage
1273  */
1274 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1275                             ram_addr_t current_addr, RAMBlock *block,
1276                             ram_addr_t offset, bool last_stage)
1277 {
1278     int encoded_len = 0, bytes_xbzrle;
1279     uint8_t *prev_cached_page;
1280
1281     if (!cache_is_cached(XBZRLE.cache, current_addr,
1282                          ram_counters.dirty_sync_count)) {
1283         xbzrle_counters.cache_miss++;
1284         if (!last_stage) {
1285             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1286                              ram_counters.dirty_sync_count) == -1) {
1287                 return -1;
1288             } else {
1289                 /* update *current_data when the page has been
1290                    inserted into cache */
1291                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1292             }
1293         }
1294         return -1;
1295     }
1296
1297     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1298
1299     /* save current buffer into memory */
1300     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1301
1302     /* XBZRLE encoding (if there is no overflow) */
1303     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1304                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1305                                        TARGET_PAGE_SIZE);
1306     if (encoded_len == 0) {
1307         trace_save_xbzrle_page_skipping();
1308         return 0;
1309     } else if (encoded_len == -1) {
1310         trace_save_xbzrle_page_overflow();
1311         xbzrle_counters.overflow++;
1312         /* update data in the cache */
1313         if (!last_stage) {
1314             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1315             *current_data = prev_cached_page;
1316         }
1317         return -1;
1318     }
1319
1320     /* we need to update the data in the cache, in order to get the same data */
1321     if (!last_stage) {
1322         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1323     }
1324
1325     /* Send XBZRLE based compressed page */
1326     bytes_xbzrle = save_page_header(rs, rs->f, block,
1327                                     offset | RAM_SAVE_FLAG_XBZRLE);
1328     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1329     qemu_put_be16(rs->f, encoded_len);
1330     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1331     bytes_xbzrle += encoded_len + 1 + 2;
1332     xbzrle_counters.pages++;
1333     xbzrle_counters.bytes += bytes_xbzrle;
1334     ram_counters.transferred += bytes_xbzrle;
1335
1336     return 1;
1337 }
1338
1339 /**
1340  * migration_bitmap_find_dirty: find the next dirty page from start
1341  *
1342  * Called with rcu_read_lock() to protect migration_bitmap
1343  *
1344  * Returns the byte offset within memory region of the start of a dirty page
1345  *
1346  * @rs: current RAM state
1347  * @rb: RAMBlock where to search for dirty pages
1348  * @start: page where we start the search
1349  */
1350 static inline
1351 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1352                                           unsigned long start)
1353 {
1354     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1355     unsigned long *bitmap = rb->bmap;
1356     unsigned long next;
1357
1358     if (!qemu_ram_is_migratable(rb)) {
1359         return size;
1360     }
1361
1362     if (rs->ram_bulk_stage && start > 0) {
1363         next = start + 1;
1364     } else {
1365         next = find_next_bit(bitmap, size, start);
1366     }
1367
1368     return next;
1369 }
1370
1371 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1372                                                 RAMBlock *rb,
1373                                                 unsigned long page)
1374 {
1375     bool ret;
1376
1377     ret = test_and_clear_bit(page, rb->bmap);
1378
1379     if (ret) {
1380         rs->migration_dirty_pages--;
1381     }
1382     return ret;
1383 }
1384
1385 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1386                                         ram_addr_t start, ram_addr_t length)
1387 {
1388     rs->migration_dirty_pages +=
1389         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1390                                               &rs->num_dirty_pages_period);
1391 }
1392
1393 /**
1394  * ram_pagesize_summary: calculate all the pagesizes of a VM
1395  *
1396  * Returns a summary bitmap of the page sizes of all RAMBlocks
1397  *
1398  * For VMs with just normal pages this is equivalent to the host page
1399  * size. If it's got some huge pages then it's the OR of all the
1400  * different page sizes.
1401  */
1402 uint64_t ram_pagesize_summary(void)
1403 {
1404     RAMBlock *block;
1405     uint64_t summary = 0;
1406
1407     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1408         summary |= block->page_size;
1409     }
1410
1411     return summary;
1412 }
1413
1414 static void migration_update_rates(RAMState *rs, int64_t end_time)
1415 {
1416     uint64_t iter_count = rs->iterations - rs->iterations_prev;
1417
1418     /* calculate period counters */
1419     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1420                 / (end_time - rs->time_last_bitmap_sync);
1421
1422     if (!iter_count) {
1423         return;
1424     }
1425
1426     if (migrate_use_xbzrle()) {
1427         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1428             rs->xbzrle_cache_miss_prev) / iter_count;
1429         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1430     }
1431 }
1432
1433 static void migration_bitmap_sync(RAMState *rs)
1434 {
1435     RAMBlock *block;
1436     int64_t end_time;
1437     uint64_t bytes_xfer_now;
1438
1439     ram_counters.dirty_sync_count++;
1440
1441     if (!rs->time_last_bitmap_sync) {
1442         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1443     }
1444
1445     trace_migration_bitmap_sync_start();
1446     memory_global_dirty_log_sync();
1447
1448     qemu_mutex_lock(&rs->bitmap_mutex);
1449     rcu_read_lock();
1450     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1451         migration_bitmap_sync_range(rs, block, 0, block->used_length);
1452     }
1453     ram_counters.remaining = ram_bytes_remaining();
1454     rcu_read_unlock();
1455     qemu_mutex_unlock(&rs->bitmap_mutex);
1456
1457     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1458
1459     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1460
1461     /* more than 1 second = 1000 millisecons */
1462     if (end_time > rs->time_last_bitmap_sync + 1000) {
1463         bytes_xfer_now = ram_counters.transferred;
1464
1465         /* During block migration the auto-converge logic incorrectly detects
1466          * that ram migration makes no progress. Avoid this by disabling the
1467          * throttling logic during the bulk phase of block migration. */
1468         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1469             /* The following detection logic can be refined later. For now:
1470                Check to see if the dirtied bytes is 50% more than the approx.
1471                amount of bytes that just got transferred since the last time we
1472                were in this routine. If that happens twice, start or increase
1473                throttling */
1474
1475             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1476                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1477                 (++rs->dirty_rate_high_cnt >= 2)) {
1478                     trace_migration_throttle();
1479                     rs->dirty_rate_high_cnt = 0;
1480                     mig_throttle_guest_down();
1481             }
1482         }
1483
1484         migration_update_rates(rs, end_time);
1485
1486         rs->iterations_prev = rs->iterations;
1487
1488         /* reset period counters */
1489         rs->time_last_bitmap_sync = end_time;
1490         rs->num_dirty_pages_period = 0;
1491         rs->bytes_xfer_prev = bytes_xfer_now;
1492     }
1493     if (migrate_use_events()) {
1494         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1495     }
1496 }
1497
1498 /**
1499  * save_zero_page: send the zero page to the stream
1500  *
1501  * Returns the number of pages written.
1502  *
1503  * @rs: current RAM state
1504  * @block: block that contains the page we want to send
1505  * @offset: offset inside the block for the page
1506  */
1507 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1508 {
1509     uint8_t *p = block->host + offset;
1510     int pages = -1;
1511
1512     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1513         ram_counters.duplicate++;
1514         ram_counters.transferred +=
1515             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1516         qemu_put_byte(rs->f, 0);
1517         ram_counters.transferred += 1;
1518         pages = 1;
1519     }
1520
1521     return pages;
1522 }
1523
1524 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1525 {
1526     if (!migrate_release_ram() || !migration_in_postcopy()) {
1527         return;
1528     }
1529
1530     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1531 }
1532
1533 /*
1534  * @pages: the number of pages written by the control path,
1535  *        < 0 - error
1536  *        > 0 - number of pages written
1537  *
1538  * Return true if the pages has been saved, otherwise false is returned.
1539  */
1540 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1541                               int *pages)
1542 {
1543     uint64_t bytes_xmit = 0;
1544     int ret;
1545
1546     *pages = -1;
1547     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1548                                 &bytes_xmit);
1549     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1550         return false;
1551     }
1552
1553     if (bytes_xmit) {
1554         ram_counters.transferred += bytes_xmit;
1555         *pages = 1;
1556     }
1557
1558     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1559         return true;
1560     }
1561
1562     if (bytes_xmit > 0) {
1563         ram_counters.normal++;
1564     } else if (bytes_xmit == 0) {
1565         ram_counters.duplicate++;
1566     }
1567
1568     return true;
1569 }
1570
1571 /*
1572  * directly send the page to the stream
1573  *
1574  * Returns the number of pages written.
1575  *
1576  * @rs: current RAM state
1577  * @block: block that contains the page we want to send
1578  * @offset: offset inside the block for the page
1579  * @buf: the page to be sent
1580  * @async: send to page asyncly
1581  */
1582 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1583                             uint8_t *buf, bool async)
1584 {
1585     ram_counters.transferred += save_page_header(rs, rs->f, block,
1586                                                  offset | RAM_SAVE_FLAG_PAGE);
1587     if (async) {
1588         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1589                               migrate_release_ram() &
1590                               migration_in_postcopy());
1591     } else {
1592         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1593     }
1594     ram_counters.transferred += TARGET_PAGE_SIZE;
1595     ram_counters.normal++;
1596     return 1;
1597 }
1598
1599 /**
1600  * ram_save_page: send the given page to the stream
1601  *
1602  * Returns the number of pages written.
1603  *          < 0 - error
1604  *          >=0 - Number of pages written - this might legally be 0
1605  *                if xbzrle noticed the page was the same.
1606  *
1607  * @rs: current RAM state
1608  * @block: block that contains the page we want to send
1609  * @offset: offset inside the block for the page
1610  * @last_stage: if we are at the completion stage
1611  */
1612 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1613 {
1614     int pages = -1;
1615     uint8_t *p;
1616     bool send_async = true;
1617     RAMBlock *block = pss->block;
1618     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1619     ram_addr_t current_addr = block->offset + offset;
1620
1621     p = block->host + offset;
1622     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1623
1624     XBZRLE_cache_lock();
1625     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1626         migrate_use_xbzrle()) {
1627         pages = save_xbzrle_page(rs, &p, current_addr, block,
1628                                  offset, last_stage);
1629         if (!last_stage) {
1630             /* Can't send this cached data async, since the cache page
1631              * might get updated before it gets to the wire
1632              */
1633             send_async = false;
1634         }
1635     }
1636
1637     /* XBZRLE overflow or normal page */
1638     if (pages == -1) {
1639         pages = save_normal_page(rs, block, offset, p, send_async);
1640     }
1641
1642     XBZRLE_cache_unlock();
1643
1644     return pages;
1645 }
1646
1647 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1648                                 ram_addr_t offset, uint8_t *source_buf)
1649 {
1650     RAMState *rs = ram_state;
1651     int bytes_sent, blen;
1652     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1653
1654     bytes_sent = save_page_header(rs, f, block, offset |
1655                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1656
1657     /*
1658      * copy it to a internal buffer to avoid it being modified by VM
1659      * so that we can catch up the error during compression and
1660      * decompression
1661      */
1662     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1663     blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1664     if (blen < 0) {
1665         bytes_sent = 0;
1666         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1667         error_report("compressed data failed!");
1668     } else {
1669         bytes_sent += blen;
1670         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1671     }
1672
1673     return bytes_sent;
1674 }
1675
1676 static void flush_compressed_data(RAMState *rs)
1677 {
1678     int idx, len, thread_count;
1679
1680     if (!migrate_use_compression()) {
1681         return;
1682     }
1683     thread_count = migrate_compress_threads();
1684
1685     qemu_mutex_lock(&comp_done_lock);
1686     for (idx = 0; idx < thread_count; idx++) {
1687         while (!comp_param[idx].done) {
1688             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1689         }
1690     }
1691     qemu_mutex_unlock(&comp_done_lock);
1692
1693     for (idx = 0; idx < thread_count; idx++) {
1694         qemu_mutex_lock(&comp_param[idx].mutex);
1695         if (!comp_param[idx].quit) {
1696             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1697             ram_counters.transferred += len;
1698         }
1699         qemu_mutex_unlock(&comp_param[idx].mutex);
1700     }
1701 }
1702
1703 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1704                                        ram_addr_t offset)
1705 {
1706     param->block = block;
1707     param->offset = offset;
1708 }
1709
1710 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1711                                            ram_addr_t offset)
1712 {
1713     int idx, thread_count, bytes_xmit = -1, pages = -1;
1714
1715     thread_count = migrate_compress_threads();
1716     qemu_mutex_lock(&comp_done_lock);
1717     while (true) {
1718         for (idx = 0; idx < thread_count; idx++) {
1719             if (comp_param[idx].done) {
1720                 comp_param[idx].done = false;
1721                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1722                 qemu_mutex_lock(&comp_param[idx].mutex);
1723                 set_compress_params(&comp_param[idx], block, offset);
1724                 qemu_cond_signal(&comp_param[idx].cond);
1725                 qemu_mutex_unlock(&comp_param[idx].mutex);
1726                 pages = 1;
1727                 ram_counters.normal++;
1728                 ram_counters.transferred += bytes_xmit;
1729                 break;
1730             }
1731         }
1732         if (pages > 0) {
1733             break;
1734         } else {
1735             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1736         }
1737     }
1738     qemu_mutex_unlock(&comp_done_lock);
1739
1740     return pages;
1741 }
1742
1743 /**
1744  * find_dirty_block: find the next dirty page and update any state
1745  * associated with the search process.
1746  *
1747  * Returns if a page is found
1748  *
1749  * @rs: current RAM state
1750  * @pss: data about the state of the current dirty page scan
1751  * @again: set to false if the search has scanned the whole of RAM
1752  */
1753 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1754 {
1755     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1756     if (pss->complete_round && pss->block == rs->last_seen_block &&
1757         pss->page >= rs->last_page) {
1758         /*
1759          * We've been once around the RAM and haven't found anything.
1760          * Give up.
1761          */
1762         *again = false;
1763         return false;
1764     }
1765     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1766         /* Didn't find anything in this RAM Block */
1767         pss->page = 0;
1768         pss->block = QLIST_NEXT_RCU(pss->block, next);
1769         if (!pss->block) {
1770             /* Hit the end of the list */
1771             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1772             /* Flag that we've looped */
1773             pss->complete_round = true;
1774             rs->ram_bulk_stage = false;
1775             if (migrate_use_xbzrle()) {
1776                 /* If xbzrle is on, stop using the data compression at this
1777                  * point. In theory, xbzrle can do better than compression.
1778                  */
1779                 flush_compressed_data(rs);
1780             }
1781         }
1782         /* Didn't find anything this time, but try again on the new block */
1783         *again = true;
1784         return false;
1785     } else {
1786         /* Can go around again, but... */
1787         *again = true;
1788         /* We've found something so probably don't need to */
1789         return true;
1790     }
1791 }
1792
1793 /**
1794  * unqueue_page: gets a page of the queue
1795  *
1796  * Helper for 'get_queued_page' - gets a page off the queue
1797  *
1798  * Returns the block of the page (or NULL if none available)
1799  *
1800  * @rs: current RAM state
1801  * @offset: used to return the offset within the RAMBlock
1802  */
1803 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1804 {
1805     RAMBlock *block = NULL;
1806
1807     qemu_mutex_lock(&rs->src_page_req_mutex);
1808     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1809         struct RAMSrcPageRequest *entry =
1810                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1811         block = entry->rb;
1812         *offset = entry->offset;
1813
1814         if (entry->len > TARGET_PAGE_SIZE) {
1815             entry->len -= TARGET_PAGE_SIZE;
1816             entry->offset += TARGET_PAGE_SIZE;
1817         } else {
1818             memory_region_unref(block->mr);
1819             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1820             g_free(entry);
1821             migration_consume_urgent_request();
1822         }
1823     }
1824     qemu_mutex_unlock(&rs->src_page_req_mutex);
1825
1826     return block;
1827 }
1828
1829 /**
1830  * get_queued_page: unqueue a page from the postocpy requests
1831  *
1832  * Skips pages that are already sent (!dirty)
1833  *
1834  * Returns if a queued page is found
1835  *
1836  * @rs: current RAM state
1837  * @pss: data about the state of the current dirty page scan
1838  */
1839 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1840 {
1841     RAMBlock  *block;
1842     ram_addr_t offset;
1843     bool dirty;
1844
1845     do {
1846         block = unqueue_page(rs, &offset);
1847         /*
1848          * We're sending this page, and since it's postcopy nothing else
1849          * will dirty it, and we must make sure it doesn't get sent again
1850          * even if this queue request was received after the background
1851          * search already sent it.
1852          */
1853         if (block) {
1854             unsigned long page;
1855
1856             page = offset >> TARGET_PAGE_BITS;
1857             dirty = test_bit(page, block->bmap);
1858             if (!dirty) {
1859                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1860                        page, test_bit(page, block->unsentmap));
1861             } else {
1862                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1863             }
1864         }
1865
1866     } while (block && !dirty);
1867
1868     if (block) {
1869         /*
1870          * As soon as we start servicing pages out of order, then we have
1871          * to kill the bulk stage, since the bulk stage assumes
1872          * in (migration_bitmap_find_and_reset_dirty) that every page is
1873          * dirty, that's no longer true.
1874          */
1875         rs->ram_bulk_stage = false;
1876
1877         /*
1878          * We want the background search to continue from the queued page
1879          * since the guest is likely to want other pages near to the page
1880          * it just requested.
1881          */
1882         pss->block = block;
1883         pss->page = offset >> TARGET_PAGE_BITS;
1884     }
1885
1886     return !!block;
1887 }
1888
1889 /**
1890  * migration_page_queue_free: drop any remaining pages in the ram
1891  * request queue
1892  *
1893  * It should be empty at the end anyway, but in error cases there may
1894  * be some left.  in case that there is any page left, we drop it.
1895  *
1896  */
1897 static void migration_page_queue_free(RAMState *rs)
1898 {
1899     struct RAMSrcPageRequest *mspr, *next_mspr;
1900     /* This queue generally should be empty - but in the case of a failed
1901      * migration might have some droppings in.
1902      */
1903     rcu_read_lock();
1904     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1905         memory_region_unref(mspr->rb->mr);
1906         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1907         g_free(mspr);
1908     }
1909     rcu_read_unlock();
1910 }
1911
1912 /**
1913  * ram_save_queue_pages: queue the page for transmission
1914  *
1915  * A request from postcopy destination for example.
1916  *
1917  * Returns zero on success or negative on error
1918  *
1919  * @rbname: Name of the RAMBLock of the request. NULL means the
1920  *          same that last one.
1921  * @start: starting address from the start of the RAMBlock
1922  * @len: length (in bytes) to send
1923  */
1924 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1925 {
1926     RAMBlock *ramblock;
1927     RAMState *rs = ram_state;
1928
1929     ram_counters.postcopy_requests++;
1930     rcu_read_lock();
1931     if (!rbname) {
1932         /* Reuse last RAMBlock */
1933         ramblock = rs->last_req_rb;
1934
1935         if (!ramblock) {
1936             /*
1937              * Shouldn't happen, we can't reuse the last RAMBlock if
1938              * it's the 1st request.
1939              */
1940             error_report("ram_save_queue_pages no previous block");
1941             goto err;
1942         }
1943     } else {
1944         ramblock = qemu_ram_block_by_name(rbname);
1945
1946         if (!ramblock) {
1947             /* We shouldn't be asked for a non-existent RAMBlock */
1948             error_report("ram_save_queue_pages no block '%s'", rbname);
1949             goto err;
1950         }
1951         rs->last_req_rb = ramblock;
1952     }
1953     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1954     if (start+len > ramblock->used_length) {
1955         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1956                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1957                      __func__, start, len, ramblock->used_length);
1958         goto err;
1959     }
1960
1961     struct RAMSrcPageRequest *new_entry =
1962         g_malloc0(sizeof(struct RAMSrcPageRequest));
1963     new_entry->rb = ramblock;
1964     new_entry->offset = start;
1965     new_entry->len = len;
1966
1967     memory_region_ref(ramblock->mr);
1968     qemu_mutex_lock(&rs->src_page_req_mutex);
1969     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1970     migration_make_urgent_request();
1971     qemu_mutex_unlock(&rs->src_page_req_mutex);
1972     rcu_read_unlock();
1973
1974     return 0;
1975
1976 err:
1977     rcu_read_unlock();
1978     return -1;
1979 }
1980
1981 static bool save_page_use_compression(RAMState *rs)
1982 {
1983     if (!migrate_use_compression()) {
1984         return false;
1985     }
1986
1987     /*
1988      * If xbzrle is on, stop using the data compression after first
1989      * round of migration even if compression is enabled. In theory,
1990      * xbzrle can do better than compression.
1991      */
1992     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1993         return true;
1994     }
1995
1996     return false;
1997 }
1998
1999 /**
2000  * ram_save_target_page: save one target page
2001  *
2002  * Returns the number of pages written
2003  *
2004  * @rs: current RAM state
2005  * @pss: data about the page we want to send
2006  * @last_stage: if we are at the completion stage
2007  */
2008 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2009                                 bool last_stage)
2010 {
2011     RAMBlock *block = pss->block;
2012     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2013     int res;
2014
2015     if (control_save_page(rs, block, offset, &res)) {
2016         return res;
2017     }
2018
2019     /*
2020      * When starting the process of a new block, the first page of
2021      * the block should be sent out before other pages in the same
2022      * block, and all the pages in last block should have been sent
2023      * out, keeping this order is important, because the 'cont' flag
2024      * is used to avoid resending the block name.
2025      */
2026     if (block != rs->last_sent_block && save_page_use_compression(rs)) {
2027             flush_compressed_data(rs);
2028     }
2029
2030     res = save_zero_page(rs, block, offset);
2031     if (res > 0) {
2032         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2033          * page would be stale
2034          */
2035         if (!save_page_use_compression(rs)) {
2036             XBZRLE_cache_lock();
2037             xbzrle_cache_zero_page(rs, block->offset + offset);
2038             XBZRLE_cache_unlock();
2039         }
2040         ram_release_pages(block->idstr, offset, res);
2041         return res;
2042     }
2043
2044     /*
2045      * Make sure the first page is sent out before other pages.
2046      *
2047      * we post it as normal page as compression will take much
2048      * CPU resource.
2049      */
2050     if (block == rs->last_sent_block && save_page_use_compression(rs)) {
2051         return compress_page_with_multi_thread(rs, block, offset);
2052     }
2053
2054     return ram_save_page(rs, pss, last_stage);
2055 }
2056
2057 /**
2058  * ram_save_host_page: save a whole host page
2059  *
2060  * Starting at *offset send pages up to the end of the current host
2061  * page. It's valid for the initial offset to point into the middle of
2062  * a host page in which case the remainder of the hostpage is sent.
2063  * Only dirty target pages are sent. Note that the host page size may
2064  * be a huge page for this block.
2065  * The saving stops at the boundary of the used_length of the block
2066  * if the RAMBlock isn't a multiple of the host page size.
2067  *
2068  * Returns the number of pages written or negative on error
2069  *
2070  * @rs: current RAM state
2071  * @ms: current migration state
2072  * @pss: data about the page we want to send
2073  * @last_stage: if we are at the completion stage
2074  */
2075 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2076                               bool last_stage)
2077 {
2078     int tmppages, pages = 0;
2079     size_t pagesize_bits =
2080         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2081
2082     if (!qemu_ram_is_migratable(pss->block)) {
2083         error_report("block %s should not be migrated !", pss->block->idstr);
2084         return 0;
2085     }
2086
2087     do {
2088         /* Check the pages is dirty and if it is send it */
2089         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2090             pss->page++;
2091             continue;
2092         }
2093
2094         tmppages = ram_save_target_page(rs, pss, last_stage);
2095         if (tmppages < 0) {
2096             return tmppages;
2097         }
2098
2099         pages += tmppages;
2100         if (pss->block->unsentmap) {
2101             clear_bit(pss->page, pss->block->unsentmap);
2102         }
2103
2104         pss->page++;
2105     } while ((pss->page & (pagesize_bits - 1)) &&
2106              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2107
2108     /* The offset we leave with is the last one we looked at */
2109     pss->page--;
2110     return pages;
2111 }
2112
2113 /**
2114  * ram_find_and_save_block: finds a dirty page and sends it to f
2115  *
2116  * Called within an RCU critical section.
2117  *
2118  * Returns the number of pages written where zero means no dirty pages
2119  *
2120  * @rs: current RAM state
2121  * @last_stage: if we are at the completion stage
2122  *
2123  * On systems where host-page-size > target-page-size it will send all the
2124  * pages in a host page that are dirty.
2125  */
2126
2127 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2128 {
2129     PageSearchStatus pss;
2130     int pages = 0;
2131     bool again, found;
2132
2133     /* No dirty page as there is zero RAM */
2134     if (!ram_bytes_total()) {
2135         return pages;
2136     }
2137
2138     pss.block = rs->last_seen_block;
2139     pss.page = rs->last_page;
2140     pss.complete_round = false;
2141
2142     if (!pss.block) {
2143         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2144     }
2145
2146     do {
2147         again = true;
2148         found = get_queued_page(rs, &pss);
2149
2150         if (!found) {
2151             /* priority queue empty, so just search for something dirty */
2152             found = find_dirty_block(rs, &pss, &again);
2153         }
2154
2155         if (found) {
2156             pages = ram_save_host_page(rs, &pss, last_stage);
2157         }
2158     } while (!pages && again);
2159
2160     rs->last_seen_block = pss.block;
2161     rs->last_page = pss.page;
2162
2163     return pages;
2164 }
2165
2166 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2167 {
2168     uint64_t pages = size / TARGET_PAGE_SIZE;
2169
2170     if (zero) {
2171         ram_counters.duplicate += pages;
2172     } else {
2173         ram_counters.normal += pages;
2174         ram_counters.transferred += size;
2175         qemu_update_position(f, size);
2176     }
2177 }
2178
2179 uint64_t ram_bytes_total(void)
2180 {
2181     RAMBlock *block;
2182     uint64_t total = 0;
2183
2184     rcu_read_lock();
2185     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2186         total += block->used_length;
2187     }
2188     rcu_read_unlock();
2189     return total;
2190 }
2191
/*
 * Allocate XBZRLE.decoded_buf, one target page in size.
 * The buffer is released by xbzrle_load_cleanup().
 */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
2196
/*
 * Free XBZRLE.decoded_buf (allocated by xbzrle_load_setup()) and reset
 * the pointer so that a repeated cleanup is harmless.
 */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
2202
2203 static void ram_state_cleanup(RAMState **rsp)
2204 {
2205     if (*rsp) {
2206         migration_page_queue_free(*rsp);
2207         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2208         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2209         g_free(*rsp);
2210         *rsp = NULL;
2211     }
2212 }
2213
2214 static void xbzrle_cleanup(void)
2215 {
2216     XBZRLE_cache_lock();
2217     if (XBZRLE.cache) {
2218         cache_fini(XBZRLE.cache);
2219         g_free(XBZRLE.encoded_buf);
2220         g_free(XBZRLE.current_buf);
2221         g_free(XBZRLE.zero_target_page);
2222         XBZRLE.cache = NULL;
2223         XBZRLE.encoded_buf = NULL;
2224         XBZRLE.current_buf = NULL;
2225         XBZRLE.zero_target_page = NULL;
2226     }
2227     XBZRLE_cache_unlock();
2228 }
2229
2230 static void ram_save_cleanup(void *opaque)
2231 {
2232     RAMState **rsp = opaque;
2233     RAMBlock *block;
2234
2235     /* caller have hold iothread lock or is in a bh, so there is
2236      * no writing race against this migration_bitmap
2237      */
2238     memory_global_dirty_log_stop();
2239
2240     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2241         g_free(block->bmap);
2242         block->bmap = NULL;
2243         g_free(block->unsentmap);
2244         block->unsentmap = NULL;
2245     }
2246
2247     xbzrle_cleanup();
2248     compress_threads_save_cleanup();
2249     ram_state_cleanup(rsp);
2250 }
2251
2252 static void ram_state_reset(RAMState *rs)
2253 {
2254     rs->last_seen_block = NULL;
2255     rs->last_sent_block = NULL;
2256     rs->last_page = 0;
2257     rs->last_version = ram_list.version;
2258     rs->ram_bulk_stage = true;
2259 }
2260
/*
 * Time limit in milliseconds, half the buffered_file limit.
 * NOTE(review): the users of this constant are outside this part of
 * the file.
 */
#define MAX_WAIT 50
2262
/*
 * Debug helper: dump a bitmap to stderr, 128 bits per line, printing
 * '1' for a set bit and '.' for a clear one.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * 'todump' is the bitmap to dump and 'pages' the number of bits in it.
 * There is no default bitmap to fall back to, so a NULL 'todump' is
 * ignored rather than dereferenced.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    enum { LINE_BITS = 128 };
    char linebuf[LINE_BITS + 1];
    uint64_t cur;

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += LINE_BITS) {
        /*
         * The last line may be shorter than a full line when fewer than
         * LINE_BITS bits remain.
         */
        uint64_t remaining = pages - cur;
        unsigned int linelen = remaining < LINE_BITS ? remaining : LINE_BITS;
        unsigned int curb;
        bool found = false;

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* Only lines containing at least one unexpected bit are shown */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2296
2297 /* **** functions for postcopy ***** */
2298
2299 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2300 {
2301     struct RAMBlock *block;
2302
2303     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2304         unsigned long *bitmap = block->bmap;
2305         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2306         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2307
2308         while (run_start < range) {
2309             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2310             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2311                               (run_end - run_start) << TARGET_PAGE_BITS);
2312             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2313         }
2314     }
2315 }
2316
2317 /**
2318  * postcopy_send_discard_bm_ram: discard a RAMBlock
2319  *
2320  * Returns zero on success
2321  *
2322  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2323  * Note: At this point the 'unsentmap' is the processed bitmap combined
2324  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2325  *
2326  * @ms: current migration state
2327  * @pds: state for postcopy
2328  * @start: RAMBlock starting page
2329  * @length: RAMBlock size
2330  */
2331 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2332                                         PostcopyDiscardState *pds,
2333                                         RAMBlock *block)
2334 {
2335     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2336     unsigned long current;
2337     unsigned long *unsentmap = block->unsentmap;
2338
2339     for (current = 0; current < end; ) {
2340         unsigned long one = find_next_bit(unsentmap, end, current);
2341
2342         if (one <= end) {
2343             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2344             unsigned long discard_length;
2345
2346             if (zero >= end) {
2347                 discard_length = end - one;
2348             } else {
2349                 discard_length = zero - one;
2350             }
2351             if (discard_length) {
2352                 postcopy_discard_send_range(ms, pds, one, discard_length);
2353             }
2354             current = one + discard_length;
2355         } else {
2356             current = one;
2357         }
2358     }
2359
2360     return 0;
2361 }
2362
2363 /**
2364  * postcopy_each_ram_send_discard: discard all RAMBlocks
2365  *
2366  * Returns 0 for success or negative for error
2367  *
2368  * Utility for the outgoing postcopy code.
2369  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2370  *   passing it bitmap indexes and name.
2371  * (qemu_ram_foreach_block ends up passing unscaled lengths
2372  *  which would mean postcopy code would have to deal with target page)
2373  *
2374  * @ms: current migration state
2375  */
2376 static int postcopy_each_ram_send_discard(MigrationState *ms)
2377 {
2378     struct RAMBlock *block;
2379     int ret;
2380
2381     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2382         PostcopyDiscardState *pds =
2383             postcopy_discard_send_init(ms, block->idstr);
2384
2385         /*
2386          * Postcopy sends chunks of bitmap over the wire, but it
2387          * just needs indexes at this point, avoids it having
2388          * target page specific code.
2389          */
2390         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2391         postcopy_discard_send_finish(ms, pds);
2392         if (ret) {
2393             return ret;
2394         }
2395     }
2396
2397     return 0;
2398 }
2399
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * Runs of sent pages (unsent pass) or dirty pages (dirty pass) are
 * walked; whenever a run starts or ends in the middle of a host page,
 * that whole host page is marked unsent and dirty, and a discard for it
 * is sent to the destination where required.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        /* Despite the name this is a target-page index, not an address */
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the first target page of the host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /*
             * Clean up the bitmap.
             * NOTE(review): the loop covers a full host page even if that
             * extends past 'pages'; presumably the bitmaps are large
             * enough for this - confirm against their allocation.
             */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
2526
2527 /**
2528  * postcopy_chuck_hostpages: discrad any partially sent host page
2529  *
2530  * Utility for the outgoing postcopy code.
2531  *
2532  * Discard any partially sent host-page size chunks, mark any partially
2533  * dirty host-page size chunks as all dirty.  In this case the host-page
2534  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2535  *
2536  * Returns zero on success
2537  *
2538  * @ms: current migration state
2539  * @block: block we want to work with
2540  */
2541 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2542 {
2543     PostcopyDiscardState *pds =
2544         postcopy_discard_send_init(ms, block->idstr);
2545
2546     /* First pass: Discard all partially sent host pages */
2547     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2548     /*
2549      * Second pass: Ensure that all partially dirty host pages are made
2550      * fully dirty.
2551      */
2552     postcopy_chunk_hostpages_pass(ms, false, block, pds);
2553
2554     postcopy_discard_send_finish(ms, pds);
2555     return 0;
2556 }
2557
2558 /**
2559  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2560  *
2561  * Returns zero on success
2562  *
2563  * Transmit the set of pages to be discarded after precopy to the target
2564  * these are pages that:
2565  *     a) Have been previously transmitted but are now dirty again
2566  *     b) Pages that have never been transmitted, this ensures that
2567  *        any pages on the destination that have been mapped by background
2568  *        tasks get discarded (transparent huge pages is the specific concern)
2569  * Hopefully this is pretty sparse
2570  *
2571  * @ms: current migration state
2572  */
2573 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2574 {
2575     RAMState *rs = ram_state;
2576     RAMBlock *block;
2577     int ret;
2578
2579     rcu_read_lock();
2580
2581     /* This should be our last sync, the src is now paused */
2582     migration_bitmap_sync(rs);
2583
2584     /* Easiest way to make sure we don't resume in the middle of a host-page */
2585     rs->last_seen_block = NULL;
2586     rs->last_sent_block = NULL;
2587     rs->last_page = 0;
2588
2589     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2590         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2591         unsigned long *bitmap = block->bmap;
2592         unsigned long *unsentmap = block->unsentmap;
2593
2594         if (!unsentmap) {
2595             /* We don't have a safe way to resize the sentmap, so
2596              * if the bitmap was resized it will be NULL at this
2597              * point.
2598              */
2599             error_report("migration ram resized during precopy phase");
2600             rcu_read_unlock();
2601             return -EINVAL;
2602         }
2603         /* Deal with TPS != HPS and huge pages */
2604         ret = postcopy_chunk_hostpages(ms, block);
2605         if (ret) {
2606             rcu_read_unlock();
2607             return ret;
2608         }
2609
2610         /*
2611          * Update the unsentmap to be unsentmap = unsentmap | dirty
2612          */
2613         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2614 #ifdef DEBUG_POSTCOPY
2615         ram_debug_dump_bitmap(unsentmap, true, pages);
2616 #endif
2617     }
2618     trace_ram_postcopy_send_discard_bitmap();
2619
2620     ret = postcopy_each_ram_send_discard(ms);
2621     rcu_read_unlock();
2622
2623     return ret;
2624 }
2625
2626 /**
2627  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2628  *
2629  * Returns zero on success
2630  *
2631  * @rbname: name of the RAMBlock of the request. NULL means the
2632  *          same that last one.
2633  * @start: RAMBlock starting page
2634  * @length: RAMBlock size
2635  */
2636 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2637 {
2638     int ret = -1;
2639
2640     trace_ram_discard_range(rbname, start, length);
2641
2642     rcu_read_lock();
2643     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2644
2645     if (!rb) {
2646         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2647         goto err;
2648     }
2649
2650     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2651                  length >> qemu_target_page_bits());
2652     ret = ram_block_discard_range(rb, start, length);
2653
2654 err:
2655     rcu_read_unlock();
2656
2657     return ret;
2658 }
2659
2660 /*
2661  * For every allocation, we will try not to crash the VM if the
2662  * allocation failed.
2663  */
2664 static int xbzrle_init(void)
2665 {
2666     Error *local_err = NULL;
2667
2668     if (!migrate_use_xbzrle()) {
2669         return 0;
2670     }
2671
2672     XBZRLE_cache_lock();
2673
2674     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2675     if (!XBZRLE.zero_target_page) {
2676         error_report("%s: Error allocating zero page", __func__);
2677         goto err_out;
2678     }
2679
2680     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2681                               TARGET_PAGE_SIZE, &local_err);
2682     if (!XBZRLE.cache) {
2683         error_report_err(local_err);
2684         goto free_zero_page;
2685     }
2686
2687     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2688     if (!XBZRLE.encoded_buf) {
2689         error_report("%s: Error allocating encoded_buf", __func__);
2690         goto free_cache;
2691     }
2692
2693     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2694     if (!XBZRLE.current_buf) {
2695         error_report("%s: Error allocating current_buf", __func__);
2696         goto free_encoded_buf;
2697     }
2698
2699     /* We are all good */
2700     XBZRLE_cache_unlock();
2701     return 0;
2702
2703 free_encoded_buf:
2704     g_free(XBZRLE.encoded_buf);
2705     XBZRLE.encoded_buf = NULL;
2706 free_cache:
2707     cache_fini(XBZRLE.cache);
2708     XBZRLE.cache = NULL;
2709 free_zero_page:
2710     g_free(XBZRLE.zero_target_page);
2711     XBZRLE.zero_target_page = NULL;
2712 err_out:
2713     XBZRLE_cache_unlock();
2714     return -ENOMEM;
2715 }
2716
2717 static int ram_state_init(RAMState **rsp)
2718 {
2719     *rsp = g_try_new0(RAMState, 1);
2720
2721     if (!*rsp) {
2722         error_report("%s: Init ramstate fail", __func__);
2723         return -1;
2724     }
2725
2726     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2727     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2728     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2729
2730     /*
2731      * Count the total number of pages used by ram blocks not including any
2732      * gaps due to alignment or unplugs.
2733      */
2734     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2735
2736     ram_state_reset(*rsp);
2737
2738     return 0;
2739 }
2740
2741 static void ram_list_init_bitmaps(void)
2742 {
2743     RAMBlock *block;
2744     unsigned long pages;
2745
2746     /* Skip setting bitmap if there is no RAM */
2747     if (ram_bytes_total()) {
2748         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2749             pages = block->max_length >> TARGET_PAGE_BITS;
2750             block->bmap = bitmap_new(pages);
2751             bitmap_set(block->bmap, 0, pages);
2752             if (migrate_postcopy_ram()) {
2753                 block->unsentmap = bitmap_new(pages);
2754                 bitmap_set(block->unsentmap, 0, pages);
2755             }
2756         }
2757     }
2758 }
2759
/*
 * ram_init_bitmaps: allocate the per-block bitmaps and start dirty logging
 *
 * Takes the iothread lock, the ramlist lock and an RCU read-side
 * critical section (in that order), then allocates the bitmaps, starts
 * global dirty logging and performs a first bitmap sync.  The locks are
 * released in the reverse order.
 *
 * @rs: current RAM state
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2775
2776 static int ram_init_all(RAMState **rsp)
2777 {
2778     if (ram_state_init(rsp)) {
2779         return -1;
2780     }
2781
2782     if (xbzrle_init()) {
2783         ram_state_cleanup(rsp);
2784         return -1;
2785     }
2786
2787     ram_init_bitmaps(*rsp);
2788
2789     return 0;
2790 }
2791
2792 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2793 {
2794     RAMBlock *block;
2795     uint64_t pages = 0;
2796
2797     /*
2798      * Postcopy is not using xbzrle/compression, so no need for that.
2799      * Also, since source are already halted, we don't need to care
2800      * about dirty page logging as well.
2801      */
2802
2803     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2804         pages += bitmap_count_one(block->bmap,
2805                                   block->used_length >> TARGET_PAGE_BITS);
2806     }
2807
2808     /* This may not be aligned with current bitmaps. Recalculate. */
2809     rs->migration_dirty_pages = pages;
2810
2811     rs->last_seen_block = NULL;
2812     rs->last_sent_block = NULL;
2813     rs->last_page = 0;
2814     rs->last_version = ram_list.version;
2815     /*
2816      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2817      * matter what we have sent.
2818      */
2819     rs->ram_bulk_stage = false;
2820
2821     /* Update RAMState cache of output QEMUFile */
2822     rs->f = out;
2823
2824     trace_ram_state_resume_prepare(pages);
2825 }
2826
2827 /*
2828  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2829  * long-running RCU critical section.  When rcu-reclaims in the code
2830  * start to become numerous it will be necessary to reduce the
2831  * granularity of these critical sections.
2832  */
2833
2834 /**
2835  * ram_save_setup: Setup RAM for migration
2836  *
2837  * Returns zero to indicate success and negative for error
2838  *
2839  * @f: QEMUFile where to send the data
2840  * @opaque: RAMState pointer
2841  */
2842 static int ram_save_setup(QEMUFile *f, void *opaque)
2843 {
2844     RAMState **rsp = opaque;
2845     RAMBlock *block;
2846
2847     if (compress_threads_save_setup()) {
2848         return -1;
2849     }
2850
2851     /* migration has already setup the bitmap, reuse it. */
2852     if (!migration_in_colo_state()) {
2853         if (ram_init_all(rsp) != 0) {
2854             compress_threads_save_cleanup();
2855             return -1;
2856         }
2857     }
2858     (*rsp)->f = f;
2859
2860     rcu_read_lock();
2861
2862     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2863
2864     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2865         qemu_put_byte(f, strlen(block->idstr));
2866         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2867         qemu_put_be64(f, block->used_length);
2868         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2869             qemu_put_be64(f, block->page_size);
2870         }
2871     }
2872
2873     rcu_read_unlock();
2874
2875     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2876     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2877
2878     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2879
2880     return 0;
2881 }
2882
2883 /**
2884  * ram_save_iterate: iterative stage for migration
2885  *
2886  * Returns zero to indicate success and negative for error
2887  *
2888  * @f: QEMUFile where to send the data
2889  * @opaque: RAMState pointer
2890  */
2891 static int ram_save_iterate(QEMUFile *f, void *opaque)
2892 {
2893     RAMState **temp = opaque;
2894     RAMState *rs = *temp;
2895     int ret;
2896     int i;
2897     int64_t t0;
2898     int done = 0;
2899
2900     if (blk_mig_bulk_active()) {
2901         /* Avoid transferring ram during bulk phase of block migration as
2902          * the bulk phase will usually take a long time and transferring
2903          * ram updates during that time is pointless. */
2904         goto out;
2905     }
2906
2907     rcu_read_lock();
2908     if (ram_list.version != rs->last_version) {
2909         ram_state_reset(rs);
2910     }
2911
2912     /* Read version before ram_list.blocks */
2913     smp_rmb();
2914
2915     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2916
2917     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2918     i = 0;
2919     while ((ret = qemu_file_rate_limit(f)) == 0 ||
2920             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2921         int pages;
2922
2923         if (qemu_file_get_error(f)) {
2924             break;
2925         }
2926
2927         pages = ram_find_and_save_block(rs, false);
2928         /* no more pages to sent */
2929         if (pages == 0) {
2930             done = 1;
2931             break;
2932         }
2933         rs->iterations++;
2934
2935         /* we want to check in the 1st loop, just in case it was the 1st time
2936            and we had to sync the dirty bitmap.
2937            qemu_get_clock_ns() is a bit expensive, so we only check each some
2938            iterations
2939         */
2940         if ((i & 63) == 0) {
2941             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2942             if (t1 > MAX_WAIT) {
2943                 trace_ram_save_iterate_big_wait(t1, i);
2944                 break;
2945             }
2946         }
2947         i++;
2948     }
2949     flush_compressed_data(rs);
2950     rcu_read_unlock();
2951
2952     /*
2953      * Must occur before EOS (or any QEMUFile operation)
2954      * because of RDMA protocol.
2955      */
2956     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2957
2958 out:
2959     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2960     ram_counters.transferred += 8;
2961
2962     ret = qemu_file_get_error(f);
2963     if (ret < 0) {
2964         return ret;
2965     }
2966
2967     return done;
2968 }
2969
2970 /**
2971  * ram_save_complete: function called to send the remaining amount of ram
2972  *
2973  * Returns zero to indicate success
2974  *
2975  * Called with iothread lock
2976  *
2977  * @f: QEMUFile where to send the data
2978  * @opaque: RAMState pointer
2979  */
2980 static int ram_save_complete(QEMUFile *f, void *opaque)
2981 {
2982     RAMState **temp = opaque;
2983     RAMState *rs = *temp;
2984
2985     rcu_read_lock();
2986
2987     if (!migration_in_postcopy()) {
2988         migration_bitmap_sync(rs);
2989     }
2990
2991     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2992
2993     /* try transferring iterative blocks of memory */
2994
2995     /* flush all remaining blocks regardless of rate limiting */
2996     while (true) {
2997         int pages;
2998
2999         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3000         /* no more blocks to sent */
3001         if (pages == 0) {
3002             break;
3003         }
3004     }
3005
3006     flush_compressed_data(rs);
3007     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3008
3009     rcu_read_unlock();
3010
3011     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3012
3013     return 0;
3014 }
3015
3016 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3017                              uint64_t *res_precopy_only,
3018                              uint64_t *res_compatible,
3019                              uint64_t *res_postcopy_only)
3020 {
3021     RAMState **temp = opaque;
3022     RAMState *rs = *temp;
3023     uint64_t remaining_size;
3024
3025     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3026
3027     if (!migration_in_postcopy() &&
3028         remaining_size < max_size) {
3029         qemu_mutex_lock_iothread();
3030         rcu_read_lock();
3031         migration_bitmap_sync(rs);
3032         rcu_read_unlock();
3033         qemu_mutex_unlock_iothread();
3034         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3035     }
3036
3037     if (migrate_postcopy_ram()) {
3038         /* We can do postcopy, and all the data is postcopiable */
3039         *res_compatible += remaining_size;
3040     } else {
3041         *res_precopy_only += remaining_size;
3042     }
3043 }
3044
3045 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3046 {
3047     unsigned int xh_len;
3048     int xh_flags;
3049     uint8_t *loaded_data;
3050
3051     /* extract RLE header */
3052     xh_flags = qemu_get_byte(f);
3053     xh_len = qemu_get_be16(f);
3054
3055     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3056         error_report("Failed to load XBZRLE page - wrong compression!");
3057         return -1;
3058     }
3059
3060     if (xh_len > TARGET_PAGE_SIZE) {
3061         error_report("Failed to load XBZRLE page - len overflow!");
3062         return -1;
3063     }
3064     loaded_data = XBZRLE.decoded_buf;
3065     /* load data and decode */
3066     /* it can change loaded_data to point to an internal buffer */
3067     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3068
3069     /* decode RLE */
3070     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3071                              TARGET_PAGE_SIZE) == -1) {
3072         error_report("Failed to load XBZRLE page - decode error!");
3073         return -1;
3074     }
3075
3076     return 0;
3077 }
3078
3079 /**
3080  * ram_block_from_stream: read a RAMBlock id from the migration stream
3081  *
3082  * Must be called from within a rcu critical section.
3083  *
3084  * Returns a pointer from within the RCU-protected ram_list.
3085  *
3086  * @f: QEMUFile where to read the data from
3087  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3088  */
3089 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3090 {
3091     static RAMBlock *block = NULL;
3092     char id[256];
3093     uint8_t len;
3094
3095     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3096         if (!block) {
3097             error_report("Ack, bad migration stream!");
3098             return NULL;
3099         }
3100         return block;
3101     }
3102
3103     len = qemu_get_byte(f);
3104     qemu_get_buffer(f, (uint8_t *)id, len);
3105     id[len] = 0;
3106
3107     block = qemu_ram_block_by_name(id);
3108     if (!block) {
3109         error_report("Can't find block %s", id);
3110         return NULL;
3111     }
3112
3113     if (!qemu_ram_is_migratable(block)) {
3114         error_report("block %s should not be migrated !", id);
3115         return NULL;
3116     }
3117
3118     return block;
3119 }
3120
3121 static inline void *host_from_ram_block_offset(RAMBlock *block,
3122                                                ram_addr_t offset)
3123 {
3124     if (!offset_in_ramblock(block, offset)) {
3125         return NULL;
3126     }
3127
3128     return block->host + offset;
3129 }
3130
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been determined to be filled with
 * a single byte, fill the destination with it.  The write is skipped when
 * the fill byte is zero and the destination already reads as zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already all zeroes and asked for zeroes: nothing to write */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3147
3148 /* return the size after decompression, or negative value on error */
3149 static int
3150 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3151                      const uint8_t *source, size_t source_len)
3152 {
3153     int err;
3154
3155     err = inflateReset(stream);
3156     if (err != Z_OK) {
3157         return -1;
3158     }
3159
3160     stream->avail_in = source_len;
3161     stream->next_in = (uint8_t *)source;
3162     stream->avail_out = dest_len;
3163     stream->next_out = dest;
3164
3165     err = inflate(stream, Z_NO_FLUSH);
3166     if (err != Z_STREAM_END) {
3167         return -1;
3168     }
3169
3170     return stream->total_out;
3171 }
3172
3173 static void *do_data_decompress(void *opaque)
3174 {
3175     DecompressParam *param = opaque;
3176     unsigned long pagesize;
3177     uint8_t *des;
3178     int len, ret;
3179
3180     qemu_mutex_lock(&param->mutex);
3181     while (!param->quit) {
3182         if (param->des) {
3183             des = param->des;
3184             len = param->len;
3185             param->des = 0;
3186             qemu_mutex_unlock(&param->mutex);
3187
3188             pagesize = TARGET_PAGE_SIZE;
3189
3190             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3191                                        param->compbuf, len);
3192             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3193                 error_report("decompress data failed");
3194                 qemu_file_set_error(decomp_file, ret);
3195             }
3196
3197             qemu_mutex_lock(&decomp_done_lock);
3198             param->done = true;
3199             qemu_cond_signal(&decomp_done_cond);
3200             qemu_mutex_unlock(&decomp_done_lock);
3201
3202             qemu_mutex_lock(&param->mutex);
3203         } else {
3204             qemu_cond_wait(&param->cond, &param->mutex);
3205         }
3206     }
3207     qemu_mutex_unlock(&param->mutex);
3208
3209     return NULL;
3210 }
3211
3212 static int wait_for_decompress_done(void)
3213 {
3214     int idx, thread_count;
3215
3216     if (!migrate_use_compression()) {
3217         return 0;
3218     }
3219
3220     thread_count = migrate_decompress_threads();
3221     qemu_mutex_lock(&decomp_done_lock);
3222     for (idx = 0; idx < thread_count; idx++) {
3223         while (!decomp_param[idx].done) {
3224             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3225         }
3226     }
3227     qemu_mutex_unlock(&decomp_done_lock);
3228     return qemu_file_get_error(decomp_file);
3229 }
3230
3231 static void compress_threads_load_cleanup(void)
3232 {
3233     int i, thread_count;
3234
3235     if (!migrate_use_compression()) {
3236         return;
3237     }
3238     thread_count = migrate_decompress_threads();
3239     for (i = 0; i < thread_count; i++) {
3240         /*
3241          * we use it as a indicator which shows if the thread is
3242          * properly init'd or not
3243          */
3244         if (!decomp_param[i].compbuf) {
3245             break;
3246         }
3247
3248         qemu_mutex_lock(&decomp_param[i].mutex);
3249         decomp_param[i].quit = true;
3250         qemu_cond_signal(&decomp_param[i].cond);
3251         qemu_mutex_unlock(&decomp_param[i].mutex);
3252     }
3253     for (i = 0; i < thread_count; i++) {
3254         if (!decomp_param[i].compbuf) {
3255             break;
3256         }
3257
3258         qemu_thread_join(decompress_threads + i);
3259         qemu_mutex_destroy(&decomp_param[i].mutex);
3260         qemu_cond_destroy(&decomp_param[i].cond);
3261         inflateEnd(&decomp_param[i].stream);
3262         g_free(decomp_param[i].compbuf);
3263         decomp_param[i].compbuf = NULL;
3264     }
3265     g_free(decompress_threads);
3266     g_free(decomp_param);
3267     decompress_threads = NULL;
3268     decomp_param = NULL;
3269     decomp_file = NULL;
3270 }
3271
3272 static int compress_threads_load_setup(QEMUFile *f)
3273 {
3274     int i, thread_count;
3275
3276     if (!migrate_use_compression()) {
3277         return 0;
3278     }
3279
3280     thread_count = migrate_decompress_threads();
3281     decompress_threads = g_new0(QemuThread, thread_count);
3282     decomp_param = g_new0(DecompressParam, thread_count);
3283     qemu_mutex_init(&decomp_done_lock);
3284     qemu_cond_init(&decomp_done_cond);
3285     decomp_file = f;
3286     for (i = 0; i < thread_count; i++) {
3287         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3288             goto exit;
3289         }
3290
3291         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3292         qemu_mutex_init(&decomp_param[i].mutex);
3293         qemu_cond_init(&decomp_param[i].cond);
3294         decomp_param[i].done = true;
3295         decomp_param[i].quit = false;
3296         qemu_thread_create(decompress_threads + i, "decompress",
3297                            do_data_decompress, decomp_param + i,
3298                            QEMU_THREAD_JOINABLE);
3299     }
3300     return 0;
3301 exit:
3302     compress_threads_load_cleanup();
3303     return -1;
3304 }
3305
3306 static void decompress_data_with_multi_threads(QEMUFile *f,
3307                                                void *host, int len)
3308 {
3309     int idx, thread_count;
3310
3311     thread_count = migrate_decompress_threads();
3312     qemu_mutex_lock(&decomp_done_lock);
3313     while (true) {
3314         for (idx = 0; idx < thread_count; idx++) {
3315             if (decomp_param[idx].done) {
3316                 decomp_param[idx].done = false;
3317                 qemu_mutex_lock(&decomp_param[idx].mutex);
3318                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3319                 decomp_param[idx].des = host;
3320                 decomp_param[idx].len = len;
3321                 qemu_cond_signal(&decomp_param[idx].cond);
3322                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3323                 break;
3324             }
3325         }
3326         if (idx < thread_count) {
3327             break;
3328         } else {
3329             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3330         }
3331     }
3332     qemu_mutex_unlock(&decomp_done_lock);
3333 }
3334
3335 /**
3336  * ram_load_setup: Setup RAM for migration incoming side
3337  *
3338  * Returns zero to indicate success and negative for error
3339  *
3340  * @f: QEMUFile where to receive the data
3341  * @opaque: RAMState pointer
3342  */
3343 static int ram_load_setup(QEMUFile *f, void *opaque)
3344 {
3345     if (compress_threads_load_setup(f)) {
3346         return -1;
3347     }
3348
3349     xbzrle_load_setup();
3350     ramblock_recv_map_init();
3351     return 0;
3352 }
3353
3354 static int ram_load_cleanup(void *opaque)
3355 {
3356     RAMBlock *rb;
3357     xbzrle_load_cleanup();
3358     compress_threads_load_cleanup();
3359
3360     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3361         g_free(rb->receivedmap);
3362         rb->receivedmap = NULL;
3363     }
3364     return 0;
3365 }
3366
3367 /**
3368  * ram_postcopy_incoming_init: allocate postcopy data structures
3369  *
3370  * Returns 0 for success and negative if there was one error
3371  *
3372  * @mis: current migration incoming state
3373  *
3374  * Allocate data structures etc needed by incoming migration with
3375  * postcopy-ram. postcopy-ram's similarly names
3376  * postcopy_ram_incoming_init does the work.
3377  */
3378 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3379 {
3380     unsigned long ram_pages = last_ram_page();
3381
3382     return postcopy_ram_incoming_init(mis, ram_pages);
3383 }
3384
3385 /**
3386  * ram_load_postcopy: load a page in postcopy case
3387  *
3388  * Returns 0 for success or -errno in case of error
3389  *
3390  * Called in postcopy mode by ram_load().
3391  * rcu_read_lock is taken prior to this being called.
3392  *
3393  * @f: QEMUFile where to send the data
3394  */
3395 static int ram_load_postcopy(QEMUFile *f)
3396 {
3397     int flags = 0, ret = 0;
3398     bool place_needed = false;
3399     bool matching_page_sizes = false;
3400     MigrationIncomingState *mis = migration_incoming_get_current();
3401     /* Temporary page that is later 'placed' */
3402     void *postcopy_host_page = postcopy_get_tmp_page(mis);
3403     void *last_host = NULL;
3404     bool all_zero = false;
3405
3406     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3407         ram_addr_t addr;
3408         void *host = NULL;
3409         void *page_buffer = NULL;
3410         void *place_source = NULL;
3411         RAMBlock *block = NULL;
3412         uint8_t ch;
3413
3414         addr = qemu_get_be64(f);
3415
3416         /*
3417          * If qemu file error, we should stop here, and then "addr"
3418          * may be invalid
3419          */
3420         ret = qemu_file_get_error(f);
3421         if (ret) {
3422             break;
3423         }
3424
3425         flags = addr & ~TARGET_PAGE_MASK;
3426         addr &= TARGET_PAGE_MASK;
3427
3428         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3429         place_needed = false;
3430         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3431             block = ram_block_from_stream(f, flags);
3432
3433             host = host_from_ram_block_offset(block, addr);
3434             if (!host) {
3435                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3436                 ret = -EINVAL;
3437                 break;
3438             }
3439             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3440             /*
3441              * Postcopy requires that we place whole host pages atomically;
3442              * these may be huge pages for RAMBlocks that are backed by
3443              * hugetlbfs.
3444              * To make it atomic, the data is read into a temporary page
3445              * that's moved into place later.
3446              * The migration protocol uses,  possibly smaller, target-pages
3447              * however the source ensures it always sends all the components
3448              * of a host page in order.
3449              */
3450             page_buffer = postcopy_host_page +
3451                           ((uintptr_t)host & (block->page_size - 1));
3452             /* If all TP are zero then we can optimise the place */
3453             if (!((uintptr_t)host & (block->page_size - 1))) {
3454                 all_zero = true;
3455             } else {
3456                 /* not the 1st TP within the HP */
3457                 if (host != (last_host + TARGET_PAGE_SIZE)) {
3458                     error_report("Non-sequential target page %p/%p",
3459                                   host, last_host);
3460                     ret = -EINVAL;
3461                     break;
3462                 }
3463             }
3464
3465
3466             /*
3467              * If it's the last part of a host page then we place the host
3468              * page
3469              */
3470             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3471                                      (block->page_size - 1)) == 0;
3472             place_source = postcopy_host_page;
3473         }
3474         last_host = host;
3475
3476         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3477         case RAM_SAVE_FLAG_ZERO:
3478             ch = qemu_get_byte(f);
3479             memset(page_buffer, ch, TARGET_PAGE_SIZE);
3480             if (ch) {
3481                 all_zero = false;
3482             }
3483             break;
3484
3485         case RAM_SAVE_FLAG_PAGE:
3486             all_zero = false;
3487             if (!place_needed || !matching_page_sizes) {
3488                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3489             } else {
3490                 /* Avoids the qemu_file copy during postcopy, which is
3491                  * going to do a copy later; can only do it when we
3492                  * do this read in one go (matching page sizes)
3493                  */
3494                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3495                                          TARGET_PAGE_SIZE);
3496             }
3497             break;
3498         case RAM_SAVE_FLAG_EOS:
3499             /* normal exit */
3500             break;
3501         default:
3502             error_report("Unknown combination of migration flags: %#x"
3503                          " (postcopy mode)", flags);
3504             ret = -EINVAL;
3505             break;
3506         }
3507
3508         /* Detect for any possible file errors */
3509         if (!ret && qemu_file_get_error(f)) {
3510             ret = qemu_file_get_error(f);
3511         }
3512
3513         if (!ret && place_needed) {
3514             /* This gets called at the last target page in the host page */
3515             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3516
3517             if (all_zero) {
3518                 ret = postcopy_place_page_zero(mis, place_dest,
3519                                                block);
3520             } else {
3521                 ret = postcopy_place_page(mis, place_dest,
3522                                           place_source, block);
3523             }
3524         }
3525     }
3526
3527     return ret;
3528 }
3529
3530 static bool postcopy_is_advised(void)
3531 {
3532     PostcopyState ps = postcopy_state_get();
3533     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3534 }
3535
3536 static bool postcopy_is_running(void)
3537 {
3538     PostcopyState ps = postcopy_state_get();
3539     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3540 }
3541
3542 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3543 {
3544     int flags = 0, ret = 0, invalid_flags = 0;
3545     static uint64_t seq_iter;
3546     int len = 0;
3547     /*
3548      * If system is running in postcopy mode, page inserts to host memory must
3549      * be atomic
3550      */
3551     bool postcopy_running = postcopy_is_running();
3552     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3553     bool postcopy_advised = postcopy_is_advised();
3554
3555     seq_iter++;
3556
3557     if (version_id != 4) {
3558         ret = -EINVAL;
3559     }
3560
3561     if (!migrate_use_compression()) {
3562         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3563     }
3564     /* This RCU critical section can be very long running.
3565      * When RCU reclaims in the code start to become numerous,
3566      * it will be necessary to reduce the granularity of this
3567      * critical section.
3568      */
3569     rcu_read_lock();
3570
3571     if (postcopy_running) {
3572         ret = ram_load_postcopy(f);
3573     }
3574
3575     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3576         ram_addr_t addr, total_ram_bytes;
3577         void *host = NULL;
3578         uint8_t ch;
3579
3580         addr = qemu_get_be64(f);
3581         flags = addr & ~TARGET_PAGE_MASK;
3582         addr &= TARGET_PAGE_MASK;
3583
3584         if (flags & invalid_flags) {
3585             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3586                 error_report("Received an unexpected compressed page");
3587             }
3588
3589             ret = -EINVAL;
3590             break;
3591         }
3592
3593         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3594                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3595             RAMBlock *block = ram_block_from_stream(f, flags);
3596
3597             host = host_from_ram_block_offset(block, addr);
3598             if (!host) {
3599                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3600                 ret = -EINVAL;
3601                 break;
3602             }
3603             ramblock_recv_bitmap_set(block, host);
3604             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3605         }
3606
3607         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3608         case RAM_SAVE_FLAG_MEM_SIZE:
3609             /* Synchronize RAM block list */
3610             total_ram_bytes = addr;
3611             while (!ret && total_ram_bytes) {
3612                 RAMBlock *block;
3613                 char id[256];
3614                 ram_addr_t length;
3615
3616                 len = qemu_get_byte(f);
3617                 qemu_get_buffer(f, (uint8_t *)id, len);
3618                 id[len] = 0;
3619                 length = qemu_get_be64(f);
3620
3621                 block = qemu_ram_block_by_name(id);
3622                 if (block && !qemu_ram_is_migratable(block)) {
3623                     error_report("block %s should not be migrated !", id);
3624                     ret = -EINVAL;
3625                 } else if (block) {
3626                     if (length != block->used_length) {
3627                         Error *local_err = NULL;
3628
3629                         ret = qemu_ram_resize(block, length,
3630                                               &local_err);
3631                         if (local_err) {
3632                             error_report_err(local_err);
3633                         }
3634                     }
3635                     /* For postcopy we need to check hugepage sizes match */
3636                     if (postcopy_advised &&
3637                         block->page_size != qemu_host_page_size) {
3638                         uint64_t remote_page_size = qemu_get_be64(f);
3639                         if (remote_page_size != block->page_size) {
3640                             error_report("Mismatched RAM page size %s "
3641                                          "(local) %zd != %" PRId64,
3642                                          id, block->page_size,
3643                                          remote_page_size);
3644                             ret = -EINVAL;
3645                         }
3646                     }
3647                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3648                                           block->idstr);
3649                 } else {
3650                     error_report("Unknown ramblock \"%s\", cannot "
3651                                  "accept migration", id);
3652                     ret = -EINVAL;
3653                 }
3654
3655                 total_ram_bytes -= length;
3656             }
3657             break;
3658
3659         case RAM_SAVE_FLAG_ZERO:
3660             ch = qemu_get_byte(f);
3661             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3662             break;
3663
3664         case RAM_SAVE_FLAG_PAGE:
3665             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3666             break;
3667
3668         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3669             len = qemu_get_be32(f);
3670             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3671                 error_report("Invalid compressed data length: %d", len);
3672                 ret = -EINVAL;
3673                 break;
3674             }
3675             decompress_data_with_multi_threads(f, host, len);
3676             break;
3677
3678         case RAM_SAVE_FLAG_XBZRLE:
3679             if (load_xbzrle(f, addr, host) < 0) {
3680                 error_report("Failed to decompress XBZRLE page at "
3681                              RAM_ADDR_FMT, addr);
3682                 ret = -EINVAL;
3683                 break;
3684             }
3685             break;
3686         case RAM_SAVE_FLAG_EOS:
3687             /* normal exit */
3688             break;
3689         default:
3690             if (flags & RAM_SAVE_FLAG_HOOK) {
3691                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3692             } else {
3693                 error_report("Unknown combination of migration flags: %#x",
3694                              flags);
3695                 ret = -EINVAL;
3696             }
3697         }
3698         if (!ret) {
3699             ret = qemu_file_get_error(f);
3700         }
3701     }
3702
3703     ret |= wait_for_decompress_done();
3704     rcu_read_unlock();
3705     trace_ram_load_complete(ret, seq_iter);
3706     return ret;
3707 }
3708
/* Report whether postcopy-ram is enabled; @opaque is not used. */
static bool ram_has_postcopy(void *opaque)
{
    bool enabled = migrate_postcopy_ram();

    return enabled;
}
3713
3714 /* Sync all the dirty bitmap with destination VM.  */
3715 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3716 {
3717     RAMBlock *block;
3718     QEMUFile *file = s->to_dst_file;
3719     int ramblock_count = 0;
3720
3721     trace_ram_dirty_bitmap_sync_start();
3722
3723     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3724         qemu_savevm_send_recv_bitmap(file, block->idstr);
3725         trace_ram_dirty_bitmap_request(block->idstr);
3726         ramblock_count++;
3727     }
3728
3729     trace_ram_dirty_bitmap_sync_wait();
3730
3731     /* Wait until all the ramblocks' dirty bitmap synced */
3732     while (ramblock_count--) {
3733         qemu_sem_wait(&s->rp_state.rp_sem);
3734     }
3735
3736     trace_ram_dirty_bitmap_sync_complete();
3737
3738     return 0;
3739 }
3740
3741 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3742 {
3743     qemu_sem_post(&s->rp_state.rp_sem);
3744 }
3745
3746 /*
3747  * Read the received bitmap, revert it as the initial dirty bitmap.
3748  * This is only used when the postcopy migration is paused but wants
3749  * to resume from a middle point.
3750  */
3751 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3752 {
3753     int ret = -EINVAL;
3754     QEMUFile *file = s->rp_state.from_dst_file;
3755     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3756     uint64_t local_size = nbits / 8;
3757     uint64_t size, end_mark;
3758
3759     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3760
3761     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3762         error_report("%s: incorrect state %s", __func__,
3763                      MigrationStatus_str(s->state));
3764         return -EINVAL;
3765     }
3766
3767     /*
3768      * Note: see comments in ramblock_recv_bitmap_send() on why we
3769      * need the endianess convertion, and the paddings.
3770      */
3771     local_size = ROUND_UP(local_size, 8);
3772
3773     /* Add paddings */
3774     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3775
3776     size = qemu_get_be64(file);
3777
3778     /* The size of the bitmap should match with our ramblock */
3779     if (size != local_size) {
3780         error_report("%s: ramblock '%s' bitmap size mismatch "
3781                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3782                      block->idstr, size, local_size);
3783         ret = -EINVAL;
3784         goto out;
3785     }
3786
3787     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3788     end_mark = qemu_get_be64(file);
3789
3790     ret = qemu_file_get_error(file);
3791     if (ret || size != local_size) {
3792         error_report("%s: read bitmap failed for ramblock '%s': %d"
3793                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3794                      __func__, block->idstr, ret, local_size, size);
3795         ret = -EIO;
3796         goto out;
3797     }
3798
3799     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3800         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3801                      __func__, block->idstr, end_mark);
3802         ret = -EINVAL;
3803         goto out;
3804     }
3805
3806     /*
3807      * Endianess convertion. We are during postcopy (though paused).
3808      * The dirty bitmap won't change. We can directly modify it.
3809      */
3810     bitmap_from_le(block->bmap, le_bitmap, nbits);
3811
3812     /*
3813      * What we received is "received bitmap". Revert it as the initial
3814      * dirty bitmap for this ramblock.
3815      */
3816     bitmap_complement(block->bmap, block->bmap, nbits);
3817
3818     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3819
3820     /*
3821      * We succeeded to sync bitmap for current ramblock. If this is
3822      * the last one to sync, we need to notify the main send thread.
3823      */
3824     ram_dirty_bitmap_reload_notify(s);
3825
3826     ret = 0;
3827 out:
3828     g_free(le_bitmap);
3829     return ret;
3830 }
3831
3832 static int ram_resume_prepare(MigrationState *s, void *opaque)
3833 {
3834     RAMState *rs = *(RAMState **)opaque;
3835     int ret;
3836
3837     ret = ram_dirty_bitmap_sync_all(s, rs);
3838     if (ret) {
3839         return ret;
3840     }
3841
3842     ram_state_resume_prepare(rs, s->to_dst_file);
3843
3844     return 0;
3845 }
3846
3847 static SaveVMHandlers savevm_ram_handlers = {
3848     .save_setup = ram_save_setup,
3849     .save_live_iterate = ram_save_iterate,
3850     .save_live_complete_postcopy = ram_save_complete,
3851     .save_live_complete_precopy = ram_save_complete,
3852     .has_postcopy = ram_has_postcopy,
3853     .save_live_pending = ram_save_pending,
3854     .load_state = ram_load,
3855     .save_cleanup = ram_save_cleanup,
3856     .load_setup = ram_load_setup,
3857     .load_cleanup = ram_load_cleanup,
3858     .resume_prepare = ram_resume_prepare,
3859 };
3860
3861 void ram_mig_init(void)
3862 {
3863     qemu_mutex_init(&XBZRLE.lock);
3864     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
3865 }