migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <[email protected]>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "migration/migration.h"
  39 #include "migration/postcopy-ram.h"
  40 #include "exec/address-spaces.h"
  41 #include "migration/page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "trace.h"
  44 #include "exec/ram_addr.h"
  45 #include "qemu/rcu_queue.h"
  46 #include "migration/colo.h"
  47
  48 /***********************************************************/
  49 /* ram save/restore */
  50
/*
 * Flags OR'ed into the low bits of the 64-bit offset word that
 * save_page_header() writes at the start of every page record.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02 /* zero page: header plus a single 0 byte */
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08 /* header plus a raw TARGET_PAGE_SIZE page */
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20 /* header omits the block idstr */
#define RAM_SAVE_FLAG_XBZRLE   0x40 /* header plus an XBZRLE-encoded page */
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100 /* header plus compressed page data */

/* Page contents stored in the XBZRLE cache for pages that were sent as zero */
static uint8_t *ZERO_TARGET_PAGE;
  62
  63 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  64 {
  65     return buffer_is_zero(p, size);
  66 }
  67
/*
 * State used when saving pages with XBZRLE: the page cache plus the
 * scratch buffers that save_xbzrle_page() encodes through.
 */
static struct {
    /* buffer used for XBZRLE encoding (receives the encoded output) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (private copy of the page being encoded) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
} XBZRLE;
  79
/*
 * Buffer used for XBZRLE decoding.
 * NOTE(review): presumably owned by the load (incoming) path -- confirm
 * where it is allocated and freed.
 */
static uint8_t *xbzrle_decoded_buf;
  82
  83 static void XBZRLE_cache_lock(void)
  84 {
  85     if (migrate_use_xbzrle())
  86         qemu_mutex_lock(&XBZRLE.lock);
  87 }
  88
  89 static void XBZRLE_cache_unlock(void)
  90 {
  91     if (migrate_use_xbzrle())
  92         qemu_mutex_unlock(&XBZRLE.lock);
  93 }
  94
  95 /**
  96  * xbzrle_cache_resize: resize the xbzrle cache
  97  *
  98  * This function is called from qmp_migrate_set_cache_size in main
  99  * thread, possibly while a migration is in progress.  A running
 100  * migration may be using the cache and might finish during this call,
 101  * hence changes to the cache are protected by XBZRLE.lock().
 102  *
 103  * Returns the new_size or negative in case of error.
 104  *
 105  * @new_size: new cache size
 106  */
 107 int64_t xbzrle_cache_resize(int64_t new_size)
 108 {
 109     PageCache *new_cache;
 110     int64_t ret;
 111
 112     if (new_size < TARGET_PAGE_SIZE) {
 113         return -1;
 114     }
 115
 116     XBZRLE_cache_lock();
 117
 118     if (XBZRLE.cache != NULL) {
 119         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 120             goto out_new_size;
 121         }
 122         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 123                                         TARGET_PAGE_SIZE);
 124         if (!new_cache) {
 125             error_report("Error creating cache");
 126             ret = -1;
 127             goto out;
 128         }
 129
 130         cache_fini(XBZRLE.cache);
 131         XBZRLE.cache = new_cache;
 132     }
 133
 134 out_new_size:
 135     ret = pow2floor(new_size);
 136 out:
 137     XBZRLE_cache_unlock();
 138     return ret;
 139 }
 140
 141 /* State of RAM for migration */
 142 struct RAMState {
 143     /* Last block that we have visited searching for dirty pages */
 144     RAMBlock *last_seen_block;
 145     /* Last block from where we have sent data */
 146     RAMBlock *last_sent_block;
 147     /* Last offset we have sent data from */
 148     ram_addr_t last_offset;
 149     /* last ram version we have seen */
 150     uint32_t last_version;
 151     /* We are in the first round */
 152     bool ram_bulk_stage;
 153     /* How many times we have dirty too many pages */
 154     int dirty_rate_high_cnt;
 155     /* How many times we have synchronized the bitmap */
 156     uint64_t bitmap_sync_count;
 157     /* these variables are used for bitmap sync */
 158     /* last time we did a full bitmap_sync */
 159     int64_t time_last_bitmap_sync;
 160     /* bytes transferred at start_time */
 161     uint64_t bytes_xfer_prev;
 162     /* number of dirty pages since start_time */
 163     uint64_t num_dirty_pages_period;
 164     /* xbzrle misses since the beginning of the period */
 165     uint64_t xbzrle_cache_miss_prev;
 166 };
 167 typedef struct RAMState RAMState;
 168
 169 static RAMState ram_state;
 170
 171 /* accounting for migration statistics */
 172 typedef struct AccountingInfo {
 173     uint64_t dup_pages;
 174     uint64_t skipped_pages;
 175     uint64_t norm_pages;
 176     uint64_t iterations;
 177     uint64_t xbzrle_bytes;
 178     uint64_t xbzrle_pages;
 179     uint64_t xbzrle_cache_miss;
 180     double xbzrle_cache_miss_rate;
 181     uint64_t xbzrle_overflows;
 182 } AccountingInfo;
 183
 184 static AccountingInfo acct_info;
 185
 186 static void acct_clear(void)
 187 {
 188     memset(&acct_info, 0, sizeof(acct_info));
 189 }
 190
 191 uint64_t dup_mig_bytes_transferred(void)
 192 {
 193     return acct_info.dup_pages * TARGET_PAGE_SIZE;
 194 }
 195
 196 uint64_t dup_mig_pages_transferred(void)
 197 {
 198     return acct_info.dup_pages;
 199 }
 200
 201 uint64_t skipped_mig_bytes_transferred(void)
 202 {
 203     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
 204 }
 205
 206 uint64_t skipped_mig_pages_transferred(void)
 207 {
 208     return acct_info.skipped_pages;
 209 }
 210
 211 uint64_t norm_mig_bytes_transferred(void)
 212 {
 213     return acct_info.norm_pages * TARGET_PAGE_SIZE;
 214 }
 215
 216 uint64_t norm_mig_pages_transferred(void)
 217 {
 218     return acct_info.norm_pages;
 219 }
 220
 221 uint64_t xbzrle_mig_bytes_transferred(void)
 222 {
 223     return acct_info.xbzrle_bytes;
 224 }
 225
 226 uint64_t xbzrle_mig_pages_transferred(void)
 227 {
 228     return acct_info.xbzrle_pages;
 229 }
 230
 231 uint64_t xbzrle_mig_pages_cache_miss(void)
 232 {
 233     return acct_info.xbzrle_cache_miss;
 234 }
 235
 236 double xbzrle_mig_cache_miss_rate(void)
 237 {
 238     return acct_info.xbzrle_cache_miss_rate;
 239 }
 240
 241 uint64_t xbzrle_mig_pages_overflow(void)
 242 {
 243     return acct_info.xbzrle_overflows;
 244 }
 245
/* Held by migration_bitmap_sync() while it syncs every block's dirty range */
static QemuMutex migration_bitmap_mutex;
/*
 * Dirty page counter: increased by the result of each range sync and
 * decreased whenever migration_bitmap_clear_dirty() clears a set bit.
 */
static uint64_t migration_dirty_pages;
 248
 249 /* used by the search for pages to send */
 250 struct PageSearchStatus {
 251     /* Current block being searched */
 252     RAMBlock    *block;
 253     /* Current offset to search from */
 254     ram_addr_t   offset;
 255     /* Set once we wrap around */
 256     bool         complete_round;
 257 };
 258 typedef struct PageSearchStatus PageSearchStatus;
 259
/*
 * Migration bitmaps, reached through the migration_bitmap_rcu pointer.
 * Readers in this file fetch the pointer with atomic_rcu_read().
 */
static struct BitmapRcu {
    /* RCU head, presumably used to defer freeing the structure -- confirm */
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once
     * only maintained and used in postcopy at the moment
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
} *migration_bitmap_rcu;
 271
 272 struct CompressParam {
 273     bool done;
 274     bool quit;
 275     QEMUFile *file;
 276     QemuMutex mutex;
 277     QemuCond cond;
 278     RAMBlock *block;
 279     ram_addr_t offset;
 280 };
 281 typedef struct CompressParam CompressParam;
 282
 283 struct DecompressParam {
 284     bool done;
 285     bool quit;
 286     QemuMutex mutex;
 287     QemuCond cond;
 288     void *des;
 289     uint8_t *compbuf;
 290     int len;
 291 };
 292 typedef struct DecompressParam DecompressParam;
 293
/* One CompressParam / QemuThread per compression thread, allocated together
 * in migrate_compress_threads_create() and freed in
 * migrate_compress_threads_join().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Set to true when the compression threads are created */
static bool compression_switch;
/* Decompression counterparts of the variables above */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: needed by do_data_compress(), defined further down */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 313
 314 static void *do_data_compress(void *opaque)
 315 {
 316     CompressParam *param = opaque;
 317     RAMBlock *block;
 318     ram_addr_t offset;
 319
 320     qemu_mutex_lock(&param->mutex);
 321     while (!param->quit) {
 322         if (param->block) {
 323             block = param->block;
 324             offset = param->offset;
 325             param->block = NULL;
 326             qemu_mutex_unlock(&param->mutex);
 327
 328             do_compress_ram_page(param->file, block, offset);
 329
 330             qemu_mutex_lock(&comp_done_lock);
 331             param->done = true;
 332             qemu_cond_signal(&comp_done_cond);
 333             qemu_mutex_unlock(&comp_done_lock);
 334
 335             qemu_mutex_lock(&param->mutex);
 336         } else {
 337             qemu_cond_wait(&param->cond, &param->mutex);
 338         }
 339     }
 340     qemu_mutex_unlock(&param->mutex);
 341
 342     return NULL;
 343 }
 344
 345 static inline void terminate_compression_threads(void)
 346 {
 347     int idx, thread_count;
 348
 349     thread_count = migrate_compress_threads();
 350
 351     for (idx = 0; idx < thread_count; idx++) {
 352         qemu_mutex_lock(&comp_param[idx].mutex);
 353         comp_param[idx].quit = true;
 354         qemu_cond_signal(&comp_param[idx].cond);
 355         qemu_mutex_unlock(&comp_param[idx].mutex);
 356     }
 357 }
 358
 359 void migrate_compress_threads_join(void)
 360 {
 361     int i, thread_count;
 362
 363     if (!migrate_use_compression()) {
 364         return;
 365     }
 366     terminate_compression_threads();
 367     thread_count = migrate_compress_threads();
 368     for (i = 0; i < thread_count; i++) {
 369         qemu_thread_join(compress_threads + i);
 370         qemu_fclose(comp_param[i].file);
 371         qemu_mutex_destroy(&comp_param[i].mutex);
 372         qemu_cond_destroy(&comp_param[i].cond);
 373     }
 374     qemu_mutex_destroy(&comp_done_lock);
 375     qemu_cond_destroy(&comp_done_cond);
 376     g_free(compress_threads);
 377     g_free(comp_param);
 378     compress_threads = NULL;
 379     comp_param = NULL;
 380 }
 381
 382 void migrate_compress_threads_create(void)
 383 {
 384     int i, thread_count;
 385
 386     if (!migrate_use_compression()) {
 387         return;
 388     }
 389     compression_switch = true;
 390     thread_count = migrate_compress_threads();
 391     compress_threads = g_new0(QemuThread, thread_count);
 392     comp_param = g_new0(CompressParam, thread_count);
 393     qemu_cond_init(&comp_done_cond);
 394     qemu_mutex_init(&comp_done_lock);
 395     for (i = 0; i < thread_count; i++) {
 396         /* comp_param[i].file is just used as a dummy buffer to save data,
 397          * set its ops to empty.
 398          */
 399         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 400         comp_param[i].done = true;
 401         comp_param[i].quit = false;
 402         qemu_mutex_init(&comp_param[i].mutex);
 403         qemu_cond_init(&comp_param[i].cond);
 404         qemu_thread_create(compress_threads + i, "compress",
 405                            do_data_compress, comp_param + i,
 406                            QEMU_THREAD_JOINABLE);
 407     }
 408 }
 409
 410 /**
 411  * save_page_header: write page header to wire
 412  *
 413  * If this is the 1st block, it also writes the block identification
 414  *
 415  * Returns the number of bytes written
 416  *
 417  * @f: QEMUFile where to send the data
 418  * @block: block that contains the page we want to send
 419  * @offset: offset inside the block for the page
 420  *          in the lower bits, it contains flags
 421  */
 422 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
 423 {
 424     size_t size, len;
 425
 426     qemu_put_be64(f, offset);
 427     size = 8;
 428
 429     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 430         len = strlen(block->idstr);
 431         qemu_put_byte(f, len);
 432         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 433         size += 1 + len;
 434     }
 435     return size;
 436 }
 437
 438 /**
 439  * mig_throttle_guest_down: throotle down the guest
 440  *
 441  * Reduce amount of guest cpu execution to hopefully slow down memory
 442  * writes. If guest dirty memory rate is reduced below the rate at
 443  * which we can transfer pages to the destination then we should be
 444  * able to complete migration. Some workloads dirty memory way too
 445  * fast and will not effectively converge, even with auto-converge.
 446  */
 447 static void mig_throttle_guest_down(void)
 448 {
 449     MigrationState *s = migrate_get_current();
 450     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 451     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 452
 453     /* We have not started throttling yet. Let's start it. */
 454     if (!cpu_throttle_active()) {
 455         cpu_throttle_set(pct_initial);
 456     } else {
 457         /* Throttling already on, just increase the rate */
 458         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 459     }
 460 }
 461
 462 /**
 463  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 464  *
 465  * @rs: current RAM state
 466  * @current_addr: address for the zero page
 467  *
 468  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 469  * The important thing is that a stale (not-yet-0'd) page be replaced
 470  * by the new data.
 471  * As a bonus, if the page wasn't in the cache it gets added so that
 472  * when a small write is made into the 0'd page it gets XBZRLE sent.
 473  */
 474 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 475 {
 476     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 477         return;
 478     }
 479
 480     /* We don't care if this fails to allocate a new cache page
 481      * as long as it updated an old one */
 482     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 483                  rs->bitmap_sync_count);
 484 }
 485
 486 #define ENCODING_FLAG_XBZRLE 0x1
 487
 488 /**
 489  * save_xbzrle_page: compress and send current page
 490  *
 491  * Returns: 1 means that we wrote the page
 492  *          0 means that page is identical to the one already sent
 493  *          -1 means that xbzrle would be longer than normal
 494  *
 495  * @rs: current RAM state
 496  * @f: QEMUFile where to send the data
 497  * @current_data: pointer to the address of the page contents
 498  * @current_addr: addr of the page
 499  * @block: block that contains the page we want to send
 500  * @offset: offset inside the block for the page
 501  * @last_stage: if we are at the completion stage
 502  * @bytes_transferred: increase it with the number of transferred bytes
 503  */
 504 static int save_xbzrle_page(RAMState *rs, QEMUFile *f, uint8_t **current_data,
 505                             ram_addr_t current_addr, RAMBlock *block,
 506                             ram_addr_t offset, bool last_stage,
 507                             uint64_t *bytes_transferred)
 508 {
 509     int encoded_len = 0, bytes_xbzrle;
 510     uint8_t *prev_cached_page;
 511
 512     if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
 513         acct_info.xbzrle_cache_miss++;
 514         if (!last_stage) {
 515             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 516                              rs->bitmap_sync_count) == -1) {
 517                 return -1;
 518             } else {
 519                 /* update *current_data when the page has been
 520                    inserted into cache */
 521                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 522             }
 523         }
 524         return -1;
 525     }
 526
 527     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 528
 529     /* save current buffer into memory */
 530     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 531
 532     /* XBZRLE encoding (if there is no overflow) */
 533     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 534                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 535                                        TARGET_PAGE_SIZE);
 536     if (encoded_len == 0) {
 537         trace_save_xbzrle_page_skipping();
 538         return 0;
 539     } else if (encoded_len == -1) {
 540         trace_save_xbzrle_page_overflow();
 541         acct_info.xbzrle_overflows++;
 542         /* update data in the cache */
 543         if (!last_stage) {
 544             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 545             *current_data = prev_cached_page;
 546         }
 547         return -1;
 548     }
 549
 550     /* we need to update the data in the cache, in order to get the same data */
 551     if (!last_stage) {
 552         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 553     }
 554
 555     /* Send XBZRLE based compressed page */
 556     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
 557     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
 558     qemu_put_be16(f, encoded_len);
 559     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
 560     bytes_xbzrle += encoded_len + 1 + 2;
 561     acct_info.xbzrle_pages++;
 562     acct_info.xbzrle_bytes += bytes_xbzrle;
 563     *bytes_transferred += bytes_xbzrle;
 564
 565     return 1;
 566 }
 567
 568 /**
 569  * migration_bitmap_find_dirty: find the next dirty page from start
 570  *
 571  * Called with rcu_read_lock() to protect migration_bitmap
 572  *
 573  * Returns the byte offset within memory region of the start of a dirty page
 574  *
 575  * @rs: current RAM state
 576  * @rb: RAMBlock where to search for dirty pages
 577  * @start: starting address (typically so we can continue from previous page)
 578  * @ram_addr_abs: pointer into which to store the address of the dirty page
 579  *                within the global ram_addr space
 580  */
 581 static inline
 582 ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 583                                        ram_addr_t start,
 584                                        ram_addr_t *ram_addr_abs)
 585 {
 586     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
 587     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
 588     uint64_t rb_size = rb->used_length;
 589     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
 590     unsigned long *bitmap;
 591
 592     unsigned long next;
 593
 594     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 595     if (rs->ram_bulk_stage && nr > base) {
 596         next = nr + 1;
 597     } else {
 598         next = find_next_bit(bitmap, size, nr);
 599     }
 600
 601     *ram_addr_abs = next << TARGET_PAGE_BITS;
 602     return (next - base) << TARGET_PAGE_BITS;
 603 }
 604
 605 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
 606 {
 607     bool ret;
 608     int nr = addr >> TARGET_PAGE_BITS;
 609     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 610
 611     ret = test_and_clear_bit(nr, bitmap);
 612
 613     if (ret) {
 614         migration_dirty_pages--;
 615     }
 616     return ret;
 617 }
 618
 619 static void migration_bitmap_sync_range(RAMState *rs, ram_addr_t start,
 620                                         ram_addr_t length)
 621 {
 622     unsigned long *bitmap;
 623     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 624     migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap,
 625                              start, length, &rs->num_dirty_pages_period);
 626 }
 627
 628 /* Fix me: there are too many global variables used in migration process. */
 629 static uint64_t iterations_prev;
 630
 631 static void migration_bitmap_sync_init(RAMState *rs)
 632 {
 633     rs->time_last_bitmap_sync = 0;
 634     rs->bytes_xfer_prev = 0;
 635     rs->num_dirty_pages_period = 0;
 636     rs->xbzrle_cache_miss_prev = 0;
 637     iterations_prev = 0;
 638 }
 639
 640 /**
 641  * ram_pagesize_summary: calculate all the pagesizes of a VM
 642  *
 643  * Returns a summary bitmap of the page sizes of all RAMBlocks
 644  *
 645  * For VMs with just normal pages this is equivalent to the host page
 646  * size. If it's got some huge pages then it's the OR of all the
 647  * different page sizes.
 648  */
 649 uint64_t ram_pagesize_summary(void)
 650 {
 651     RAMBlock *block;
 652     uint64_t summary = 0;
 653
 654     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 655         summary |= block->page_size;
 656     }
 657
 658     return summary;
 659 }
 660
 661 static void migration_bitmap_sync(RAMState *rs)
 662 {
 663     RAMBlock *block;
 664     MigrationState *s = migrate_get_current();
 665     int64_t end_time;
 666     uint64_t bytes_xfer_now;
 667
 668     rs->bitmap_sync_count++;
 669
 670     if (!rs->bytes_xfer_prev) {
 671         rs->bytes_xfer_prev = ram_bytes_transferred();
 672     }
 673
 674     if (!rs->time_last_bitmap_sync) {
 675         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 676     }
 677
 678     trace_migration_bitmap_sync_start();
 679     memory_global_dirty_log_sync();
 680
 681     qemu_mutex_lock(&migration_bitmap_mutex);
 682     rcu_read_lock();
 683     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 684         migration_bitmap_sync_range(rs, block->offset, block->used_length);
 685     }
 686     rcu_read_unlock();
 687     qemu_mutex_unlock(&migration_bitmap_mutex);
 688
 689     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 690
 691     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 692
 693     /* more than 1 second = 1000 millisecons */
 694     if (end_time > rs->time_last_bitmap_sync + 1000) {
 695         if (migrate_auto_converge()) {
 696             /* The following detection logic can be refined later. For now:
 697                Check to see if the dirtied bytes is 50% more than the approx.
 698                amount of bytes that just got transferred since the last time we
 699                were in this routine. If that happens twice, start or increase
 700                throttling */
 701             bytes_xfer_now = ram_bytes_transferred();
 702
 703             if (s->dirty_pages_rate &&
 704                (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 705                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 706                (rs->dirty_rate_high_cnt++ >= 2)) {
 707                     trace_migration_throttle();
 708                     rs->dirty_rate_high_cnt = 0;
 709                     mig_throttle_guest_down();
 710              }
 711              rs->bytes_xfer_prev = bytes_xfer_now;
 712         }
 713
 714         if (migrate_use_xbzrle()) {
 715             if (iterations_prev != acct_info.iterations) {
 716                 acct_info.xbzrle_cache_miss_rate =
 717                    (double)(acct_info.xbzrle_cache_miss -
 718                             rs->xbzrle_cache_miss_prev) /
 719                    (acct_info.iterations - iterations_prev);
 720             }
 721             iterations_prev = acct_info.iterations;
 722             rs->xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
 723         }
 724         s->dirty_pages_rate = rs->num_dirty_pages_period * 1000
 725             / (end_time - rs->time_last_bitmap_sync);
 726         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
 727         rs->time_last_bitmap_sync = end_time;
 728         rs->num_dirty_pages_period = 0;
 729     }
 730     s->dirty_sync_count = rs->bitmap_sync_count;
 731     if (migrate_use_events()) {
 732         qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
 733     }
 734 }
 735
 736 /**
 737  * save_zero_page: send the zero page to the stream
 738  *
 739  * Returns the number of pages written.
 740  *
 741  * @f: QEMUFile where to send the data
 742  * @block: block that contains the page we want to send
 743  * @offset: offset inside the block for the page
 744  * @p: pointer to the page
 745  * @bytes_transferred: increase it with the number of transferred bytes
 746  */
 747 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 748                           uint8_t *p, uint64_t *bytes_transferred)
 749 {
 750     int pages = -1;
 751
 752     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 753         acct_info.dup_pages++;
 754         *bytes_transferred += save_page_header(f, block,
 755                                                offset | RAM_SAVE_FLAG_COMPRESS);
 756         qemu_put_byte(f, 0);
 757         *bytes_transferred += 1;
 758         pages = 1;
 759     }
 760
 761     return pages;
 762 }
 763
 764 static void ram_release_pages(MigrationState *ms, const char *rbname,
 765                               uint64_t offset, int pages)
 766 {
 767     if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
 768         return;
 769     }
 770
 771     ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
 772 }
 773
 774 /**
 775  * ram_save_page: send the given page to the stream
 776  *
 777  * Returns the number of pages written.
 778  *          < 0 - error
 779  *          >=0 - Number of pages written - this might legally be 0
 780  *                if xbzrle noticed the page was the same.
 781  *
 782  * @rs: current RAM state
 783  * @ms: current migration state
 784  * @f: QEMUFile where to send the data
 785  * @block: block that contains the page we want to send
 786  * @offset: offset inside the block for the page
 787  * @last_stage: if we are at the completion stage
 788  * @bytes_transferred: increase it with the number of transferred bytes
 789  */
 790 static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
 791                          PageSearchStatus *pss, bool last_stage,
 792                          uint64_t *bytes_transferred)
 793 {
 794     int pages = -1;
 795     uint64_t bytes_xmit;
 796     ram_addr_t current_addr;
 797     uint8_t *p;
 798     int ret;
 799     bool send_async = true;
 800     RAMBlock *block = pss->block;
 801     ram_addr_t offset = pss->offset;
 802
 803     p = block->host + offset;
 804
 805     /* In doubt sent page as normal */
 806     bytes_xmit = 0;
 807     ret = ram_control_save_page(f, block->offset,
 808                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 809     if (bytes_xmit) {
 810         *bytes_transferred += bytes_xmit;
 811         pages = 1;
 812     }
 813
 814     XBZRLE_cache_lock();
 815
 816     current_addr = block->offset + offset;
 817
 818     if (block == rs->last_sent_block) {
 819         offset |= RAM_SAVE_FLAG_CONTINUE;
 820     }
 821     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 822         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 823             if (bytes_xmit > 0) {
 824                 acct_info.norm_pages++;
 825             } else if (bytes_xmit == 0) {
 826                 acct_info.dup_pages++;
 827             }
 828         }
 829     } else {
 830         pages = save_zero_page(f, block, offset, p, bytes_transferred);
 831         if (pages > 0) {
 832             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 833              * page would be stale
 834              */
 835             xbzrle_cache_zero_page(rs, current_addr);
 836             ram_release_pages(ms, block->idstr, pss->offset, pages);
 837         } else if (!rs->ram_bulk_stage &&
 838                    !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
 839             pages = save_xbzrle_page(rs, f, &p, current_addr, block,
 840                                      offset, last_stage, bytes_transferred);
 841             if (!last_stage) {
 842                 /* Can't send this cached data async, since the cache page
 843                  * might get updated before it gets to the wire
 844                  */
 845                 send_async = false;
 846             }
 847         }
 848     }
 849
 850     /* XBZRLE overflow or normal page */
 851     if (pages == -1) {
 852         *bytes_transferred += save_page_header(f, block,
 853                                                offset | RAM_SAVE_FLAG_PAGE);
 854         if (send_async) {
 855             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
 856                                   migrate_release_ram() &
 857                                   migration_in_postcopy(ms));
 858         } else {
 859             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
 860         }
 861         *bytes_transferred += TARGET_PAGE_SIZE;
 862         pages = 1;
 863         acct_info.norm_pages++;
 864     }
 865
 866     XBZRLE_cache_unlock();
 867
 868     return pages;
 869 }
 870
 871 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 872                                 ram_addr_t offset)
 873 {
 874     int bytes_sent, blen;
 875     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 876
 877     bytes_sent = save_page_header(f, block, offset |
 878                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 879     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 880                                      migrate_compress_level());
 881     if (blen < 0) {
 882         bytes_sent = 0;
 883         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 884         error_report("compressed data failed!");
 885     } else {
 886         bytes_sent += blen;
 887         ram_release_pages(migrate_get_current(), block->idstr,
 888                           offset & TARGET_PAGE_MASK, 1);
 889     }
 890
 891     return bytes_sent;
 892 }
 893
 894 static uint64_t bytes_transferred;
 895
 896 static void flush_compressed_data(QEMUFile *f)
 897 {
 898     int idx, len, thread_count;
 899
 900     if (!migrate_use_compression()) {
 901         return;
 902     }
 903     thread_count = migrate_compress_threads();
 904
 905     qemu_mutex_lock(&comp_done_lock);
 906     for (idx = 0; idx < thread_count; idx++) {
 907         while (!comp_param[idx].done) {
 908             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 909         }
 910     }
 911     qemu_mutex_unlock(&comp_done_lock);
 912
 913     for (idx = 0; idx < thread_count; idx++) {
 914         qemu_mutex_lock(&comp_param[idx].mutex);
 915         if (!comp_param[idx].quit) {
 916             len = qemu_put_qemu_file(f, comp_param[idx].file);
 917             bytes_transferred += len;
 918         }
 919         qemu_mutex_unlock(&comp_param[idx].mutex);
 920     }
 921 }
 922
 923 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 924                                        ram_addr_t offset)
 925 {
 926     param->block = block;
 927     param->offset = offset;
 928 }
 929
 930 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
 931                                            ram_addr_t offset,
 932                                            uint64_t *bytes_transferred)
 933 {
 934     int idx, thread_count, bytes_xmit = -1, pages = -1;
 935
 936     thread_count = migrate_compress_threads();
 937     qemu_mutex_lock(&comp_done_lock);
 938     while (true) {
 939         for (idx = 0; idx < thread_count; idx++) {
 940             if (comp_param[idx].done) {
 941                 comp_param[idx].done = false;
 942                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
 943                 qemu_mutex_lock(&comp_param[idx].mutex);
 944                 set_compress_params(&comp_param[idx], block, offset);
 945                 qemu_cond_signal(&comp_param[idx].cond);
 946                 qemu_mutex_unlock(&comp_param[idx].mutex);
 947                 pages = 1;
 948                 acct_info.norm_pages++;
 949                 *bytes_transferred += bytes_xmit;
 950                 break;
 951             }
 952         }
 953         if (pages > 0) {
 954             break;
 955         } else {
 956             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 957         }
 958     }
 959     qemu_mutex_unlock(&comp_done_lock);
 960
 961     return pages;
 962 }
 963
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns the number of pages written.  The result stays at -1 when the
 * ram_control_save_page() hook handled the page without reporting any
 * bytes, or when compressing the first page of a new block fails.
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send (its block and the offset
 *       inside that block)
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
                                    QEMUFile *f,
                                    PageSearchStatus *pss, bool last_stage,
                                    uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    /* Give the control hook the first chance to send the page */
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        /* The hook took the page; only the accounting is left to do */
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != rs->last_sent_block) {
            flush_compressed_data(f);
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    *bytes_transferred += bytes_xmit + blen;
                    acct_info.norm_pages++;
                    pages = 1;
                } else {
                    qemu_file_set_error(f, blen);
                    error_report("compressed data failed!");
                }
            }
            if (pages > 0) {
                ram_release_pages(ms, block->idstr, pss->offset, pages);
            }
        } else {
            /* Same block as the last page sent: mark it as a continuation */
            offset |= RAM_SAVE_FLAG_CONTINUE;
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                /* Not a zero page: pass it to the compression threads */
                pages = compress_page_with_multi_thread(f, block, offset,
                                                        bytes_transferred);
            } else {
                ram_release_pages(ms, block->idstr, pss->offset, pages);
            }
        }
    }

    return pages;
}
1047
1048 /**
1049  * find_dirty_block: find the next dirty page and update any state
1050  * associated with the search process.
1051  *
1052  * Returns if a page is found
1053  *
1054  * @rs: current RAM state
1055  * @f: QEMUFile where to send the data
1056  * @pss: data about the state of the current dirty page scan
1057  * @again: set to false if the search has scanned the whole of RAM
1058  * @ram_addr_abs: pointer into which to store the address of the dirty page
1059  *                within the global ram_addr space
1060  */
1061 static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
1062                              bool *again, ram_addr_t *ram_addr_abs)
1063 {
1064     pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
1065                                               ram_addr_abs);
1066     if (pss->complete_round && pss->block == rs->last_seen_block &&
1067         pss->offset >= rs->last_offset) {
1068         /*
1069          * We've been once around the RAM and haven't found anything.
1070          * Give up.
1071          */
1072         *again = false;
1073         return false;
1074     }
1075     if (pss->offset >= pss->block->used_length) {
1076         /* Didn't find anything in this RAM Block */
1077         pss->offset = 0;
1078         pss->block = QLIST_NEXT_RCU(pss->block, next);
1079         if (!pss->block) {
1080             /* Hit the end of the list */
1081             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1082             /* Flag that we've looped */
1083             pss->complete_round = true;
1084             rs->ram_bulk_stage = false;
1085             if (migrate_use_xbzrle()) {
1086                 /* If xbzrle is on, stop using the data compression at this
1087                  * point. In theory, xbzrle can do better than compression.
1088                  */
1089                 flush_compressed_data(f);
1090                 compression_switch = false;
1091             }
1092         }
1093         /* Didn't find anything this time, but try again on the new block */
1094         *again = true;
1095         return false;
1096     } else {
1097         /* Can go around again, but... */
1098         *again = true;
1099         /* We've found something so probably don't need to */
1100         return true;
1101     }
1102 }
1103
1104 /**
1105  * unqueue_page: gets a page of the queue
1106  *
1107  * Helper for 'get_queued_page' - gets a page off the queue
1108  *
1109  * Returns the block of the page (or NULL if none available)
1110  *
1111  * @ms: current migration state
1112  * @offset: used to return the offset within the RAMBlock
1113  * @ram_addr_abs: pointer into which to store the address of the dirty page
1114  *                within the global ram_addr space
1115  */
1116 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1117                               ram_addr_t *ram_addr_abs)
1118 {
1119     RAMBlock *block = NULL;
1120
1121     qemu_mutex_lock(&ms->src_page_req_mutex);
1122     if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1123         struct MigrationSrcPageRequest *entry =
1124                                 QSIMPLEQ_FIRST(&ms->src_page_requests);
1125         block = entry->rb;
1126         *offset = entry->offset;
1127         *ram_addr_abs = (entry->offset + entry->rb->offset) &
1128                         TARGET_PAGE_MASK;
1129
1130         if (entry->len > TARGET_PAGE_SIZE) {
1131             entry->len -= TARGET_PAGE_SIZE;
1132             entry->offset += TARGET_PAGE_SIZE;
1133         } else {
1134             memory_region_unref(block->mr);
1135             QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1136             g_free(entry);
1137         }
1138     }
1139     qemu_mutex_unlock(&ms->src_page_req_mutex);
1140
1141     return block;
1142 }
1143
1144 /**
1145  * get_queued_page: unqueue a page from the postocpy requests
1146  *
1147  * Skips pages that are already sent (!dirty)
1148  *
1149  * Returns if a queued page is found
1150  *
1151  * @rs: current RAM state
1152  * @ms: current migration state
1153  * @pss: data about the state of the current dirty page scan
1154  * @ram_addr_abs: pointer into which to store the address of the dirty page
1155  *                within the global ram_addr space
1156  */
1157 static bool get_queued_page(RAMState *rs, MigrationState *ms,
1158                             PageSearchStatus *pss,
1159                             ram_addr_t *ram_addr_abs)
1160 {
1161     RAMBlock  *block;
1162     ram_addr_t offset;
1163     bool dirty;
1164
1165     do {
1166         block = unqueue_page(ms, &offset, ram_addr_abs);
1167         /*
1168          * We're sending this page, and since it's postcopy nothing else
1169          * will dirty it, and we must make sure it doesn't get sent again
1170          * even if this queue request was received after the background
1171          * search already sent it.
1172          */
1173         if (block) {
1174             unsigned long *bitmap;
1175             bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1176             dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1177             if (!dirty) {
1178                 trace_get_queued_page_not_dirty(
1179                     block->idstr, (uint64_t)offset,
1180                     (uint64_t)*ram_addr_abs,
1181                     test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1182                          atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1183             } else {
1184                 trace_get_queued_page(block->idstr,
1185                                       (uint64_t)offset,
1186                                       (uint64_t)*ram_addr_abs);
1187             }
1188         }
1189
1190     } while (block && !dirty);
1191
1192     if (block) {
1193         /*
1194          * As soon as we start servicing pages out of order, then we have
1195          * to kill the bulk stage, since the bulk stage assumes
1196          * in (migration_bitmap_find_and_reset_dirty) that every page is
1197          * dirty, that's no longer true.
1198          */
1199         rs->ram_bulk_stage = false;
1200
1201         /*
1202          * We want the background search to continue from the queued page
1203          * since the guest is likely to want other pages near to the page
1204          * it just requested.
1205          */
1206         pss->block = block;
1207         pss->offset = offset;
1208     }
1209
1210     return !!block;
1211 }
1212
1213 /**
1214  * migration_page_queue_free: drop any remaining pages in the ram
1215  * request queue
1216  *
1217  * It should be empty at the end anyway, but in error cases there may
1218  * be some left.  in case that there is any page left, we drop it.
1219  *
1220  * @ms: current migration state
1221  */
1222 void migration_page_queue_free(MigrationState *ms)
1223 {
1224     struct MigrationSrcPageRequest *mspr, *next_mspr;
1225     /* This queue generally should be empty - but in the case of a failed
1226      * migration might have some droppings in.
1227      */
1228     rcu_read_lock();
1229     QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1230         memory_region_unref(mspr->rb->mr);
1231         QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1232         g_free(mspr);
1233     }
1234     rcu_read_unlock();
1235 }
1236
1237 /**
1238  * ram_save_queue_pages: queue the page for transmission
1239  *
1240  * A request from postcopy destination for example.
1241  *
1242  * Returns zero on success or negative on error
1243  *
1244  * @ms: current migration state
1245  * @rbname: Name of the RAMBLock of the request. NULL means the
1246  *          same that last one.
1247  * @start: starting address from the start of the RAMBlock
1248  * @len: length (in bytes) to send
1249  */
1250 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1251                          ram_addr_t start, ram_addr_t len)
1252 {
1253     RAMBlock *ramblock;
1254
1255     ms->postcopy_requests++;
1256     rcu_read_lock();
1257     if (!rbname) {
1258         /* Reuse last RAMBlock */
1259         ramblock = ms->last_req_rb;
1260
1261         if (!ramblock) {
1262             /*
1263              * Shouldn't happen, we can't reuse the last RAMBlock if
1264              * it's the 1st request.
1265              */
1266             error_report("ram_save_queue_pages no previous block");
1267             goto err;
1268         }
1269     } else {
1270         ramblock = qemu_ram_block_by_name(rbname);
1271
1272         if (!ramblock) {
1273             /* We shouldn't be asked for a non-existent RAMBlock */
1274             error_report("ram_save_queue_pages no block '%s'", rbname);
1275             goto err;
1276         }
1277         ms->last_req_rb = ramblock;
1278     }
1279     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1280     if (start+len > ramblock->used_length) {
1281         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1282                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1283                      __func__, start, len, ramblock->used_length);
1284         goto err;
1285     }
1286
1287     struct MigrationSrcPageRequest *new_entry =
1288         g_malloc0(sizeof(struct MigrationSrcPageRequest));
1289     new_entry->rb = ramblock;
1290     new_entry->offset = start;
1291     new_entry->len = len;
1292
1293     memory_region_ref(ramblock->mr);
1294     qemu_mutex_lock(&ms->src_page_req_mutex);
1295     QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1296     qemu_mutex_unlock(&ms->src_page_req_mutex);
1297     rcu_read_unlock();
1298
1299     return 0;
1300
1301 err:
1302     rcu_read_unlock();
1303     return -1;
1304 }
1305
1306 /**
1307  * ram_save_target_page: save one target page
1308  *
1309  * Returns the number of pages written
1310  *
1311  * @rs: current RAM state
1312  * @ms: current migration state
1313  * @f: QEMUFile where to send the data
1314  * @pss: data about the page we want to send
1315  * @last_stage: if we are at the completion stage
1316  * @bytes_transferred: increase it with the number of transferred bytes
1317  * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
1318  */
1319 static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
1320                                 PageSearchStatus *pss,
1321                                 bool last_stage,
1322                                 uint64_t *bytes_transferred,
1323                                 ram_addr_t dirty_ram_abs)
1324 {
1325     int res = 0;
1326
1327     /* Check the pages is dirty and if it is send it */
1328     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1329         unsigned long *unsentmap;
1330         if (compression_switch && migrate_use_compression()) {
1331             res = ram_save_compressed_page(rs, ms, f, pss,
1332                                            last_stage,
1333                                            bytes_transferred);
1334         } else {
1335             res = ram_save_page(rs, ms, f, pss, last_stage,
1336                                 bytes_transferred);
1337         }
1338
1339         if (res < 0) {
1340             return res;
1341         }
1342         unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1343         if (unsentmap) {
1344             clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1345         }
1346         /* Only update last_sent_block if a block was actually sent; xbzrle
1347          * might have decided the page was identical so didn't bother writing
1348          * to the stream.
1349          */
1350         if (res > 0) {
1351             rs->last_sent_block = pss->block;
1352         }
1353     }
1354
1355     return res;
1356 }
1357
1358 /**
1359  * ram_save_host_page: save a whole host page
1360  *
1361  * Starting at *offset send pages up to the end of the current host
1362  * page. It's valid for the initial offset to point into the middle of
1363  * a host page in which case the remainder of the hostpage is sent.
1364  * Only dirty target pages are sent. Note that the host page size may
1365  * be a huge page for this block.
1366  *
1367  * Returns the number of pages written or negative on error
1368  *
1369  * @rs: current RAM state
1370  * @ms: current migration state
1371  * @f: QEMUFile where to send the data
1372  * @pss: data about the page we want to send
1373  * @last_stage: if we are at the completion stage
1374  * @bytes_transferred: increase it with the number of transferred bytes
1375  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1376  */
1377 static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
1378                               PageSearchStatus *pss,
1379                               bool last_stage,
1380                               uint64_t *bytes_transferred,
1381                               ram_addr_t dirty_ram_abs)
1382 {
1383     int tmppages, pages = 0;
1384     size_t pagesize = qemu_ram_pagesize(pss->block);
1385
1386     do {
1387         tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
1388                                         bytes_transferred, dirty_ram_abs);
1389         if (tmppages < 0) {
1390             return tmppages;
1391         }
1392
1393         pages += tmppages;
1394         pss->offset += TARGET_PAGE_SIZE;
1395         dirty_ram_abs += TARGET_PAGE_SIZE;
1396     } while (pss->offset & (pagesize - 1));
1397
1398     /* The offset we leave with is the last one we looked at */
1399     pss->offset -= TARGET_PAGE_SIZE;
1400     return pages;
1401 }
1402
1403 /**
1404  * ram_find_and_save_block: finds a dirty page and sends it to f
1405  *
1406  * Called within an RCU critical section.
1407  *
1408  * Returns the number of pages written where zero means no dirty pages
1409  *
1410  * @rs: current RAM state
1411  * @f: QEMUFile where to send the data
1412  * @last_stage: if we are at the completion stage
1413  * @bytes_transferred: increase it with the number of transferred bytes
1414  *
1415  * On systems where host-page-size > target-page-size it will send all the
1416  * pages in a host page that are dirty.
1417  */
1418
1419 static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage,
1420                                    uint64_t *bytes_transferred)
1421 {
1422     PageSearchStatus pss;
1423     MigrationState *ms = migrate_get_current();
1424     int pages = 0;
1425     bool again, found;
1426     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1427                                  ram_addr_t space */
1428
1429     /* No dirty page as there is zero RAM */
1430     if (!ram_bytes_total()) {
1431         return pages;
1432     }
1433
1434     pss.block = rs->last_seen_block;
1435     pss.offset = rs->last_offset;
1436     pss.complete_round = false;
1437
1438     if (!pss.block) {
1439         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1440     }
1441
1442     do {
1443         again = true;
1444         found = get_queued_page(rs, ms, &pss, &dirty_ram_abs);
1445
1446         if (!found) {
1447             /* priority queue empty, so just search for something dirty */
1448             found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
1449         }
1450
1451         if (found) {
1452             pages = ram_save_host_page(rs, ms, f, &pss,
1453                                        last_stage, bytes_transferred,
1454                                        dirty_ram_abs);
1455         }
1456     } while (!pages && again);
1457
1458     rs->last_seen_block = pss.block;
1459     rs->last_offset = pss.offset;
1460
1461     return pages;
1462 }
1463
1464 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1465 {
1466     uint64_t pages = size / TARGET_PAGE_SIZE;
1467     if (zero) {
1468         acct_info.dup_pages += pages;
1469     } else {
1470         acct_info.norm_pages += pages;
1471         bytes_transferred += size;
1472         qemu_update_position(f, size);
1473     }
1474 }
1475
/* Returns the current value of the migration_dirty_pages counter. */
static ram_addr_t ram_save_remaining(void)
{
    return migration_dirty_pages;
}
1480
1481 uint64_t ram_bytes_remaining(void)
1482 {
1483     return ram_save_remaining() * TARGET_PAGE_SIZE;
1484 }
1485
/* Returns the file-wide bytes_transferred counter. */
uint64_t ram_bytes_transferred(void)
{
    return bytes_transferred;
}
1490
1491 uint64_t ram_bytes_total(void)
1492 {
1493     RAMBlock *block;
1494     uint64_t total = 0;
1495
1496     rcu_read_lock();
1497     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1498         total += block->used_length;
1499     rcu_read_unlock();
1500     return total;
1501 }
1502
/*
 * Free xbzrle_decoded_buf and reset the pointer to NULL so that a later
 * call does not free it a second time.
 */
void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}
1508
1509 static void migration_bitmap_free(struct BitmapRcu *bmap)
1510 {
1511     g_free(bmap->bmap);
1512     g_free(bmap->unsentmap);
1513     g_free(bmap);
1514 }
1515
1516 static void ram_migration_cleanup(void *opaque)
1517 {
1518     /* caller have hold iothread lock or is in a bh, so there is
1519      * no writing race against this migration_bitmap
1520      */
1521     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1522     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1523     if (bitmap) {
1524         memory_global_dirty_log_stop();
1525         call_rcu(bitmap, migration_bitmap_free, rcu);
1526     }
1527
1528     XBZRLE_cache_lock();
1529     if (XBZRLE.cache) {
1530         cache_fini(XBZRLE.cache);
1531         g_free(XBZRLE.encoded_buf);
1532         g_free(XBZRLE.current_buf);
1533         g_free(ZERO_TARGET_PAGE);
1534         XBZRLE.cache = NULL;
1535         XBZRLE.encoded_buf = NULL;
1536         XBZRLE.current_buf = NULL;
1537     }
1538     XBZRLE_cache_unlock();
1539 }
1540
1541 static void ram_state_reset(RAMState *rs)
1542 {
1543     rs->last_seen_block = NULL;
1544     rs->last_sent_block = NULL;
1545     rs->last_offset = 0;
1546     rs->last_version = ram_list.version;
1547     rs->ram_bulk_stage = true;
1548 }
1549
/* Value is in milliseconds; chosen as half the buffered_file limit. */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
1551
1552 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1553 {
1554     /* called in qemu main thread, so there is
1555      * no writing race against this migration_bitmap
1556      */
1557     if (migration_bitmap_rcu) {
1558         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1559         bitmap = g_new(struct BitmapRcu, 1);
1560         bitmap->bmap = bitmap_new(new);
1561
1562         /* prevent migration_bitmap content from being set bit
1563          * by migration_bitmap_sync_range() at the same time.
1564          * it is safe to migration if migration_bitmap is cleared bit
1565          * at the same time.
1566          */
1567         qemu_mutex_lock(&migration_bitmap_mutex);
1568         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1569         bitmap_set(bitmap->bmap, old, new - old);
1570
1571         /* We don't have a way to safely extend the sentmap
1572          * with RCU; so mark it as missing, entry to postcopy
1573          * will fail.
1574          */
1575         bitmap->unsentmap = NULL;
1576
1577         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1578         qemu_mutex_unlock(&migration_bitmap_mutex);
1579         migration_dirty_pages += new - old;
1580         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1581     }
1582 }
1583
1584 /*
1585  * 'expected' is the value you expect the bitmap mostly to be full
1586  * of; it won't bother printing lines that are all this value.
1587  * If 'todump' is null the migration bitmap is dumped.
1588  */
1589 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1590 {
1591     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1592
1593     int64_t cur;
1594     int64_t linelen = 128;
1595     char linebuf[129];
1596
1597     if (!todump) {
1598         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1599     }
1600
1601     for (cur = 0; cur < ram_pages; cur += linelen) {
1602         int64_t curb;
1603         bool found = false;
1604         /*
1605          * Last line; catch the case where the line length
1606          * is longer than remaining ram
1607          */
1608         if (cur + linelen > ram_pages) {
1609             linelen = ram_pages - cur;
1610         }
1611         for (curb = 0; curb < linelen; curb++) {
1612             bool thisbit = test_bit(cur + curb, todump);
1613             linebuf[curb] = thisbit ? '1' : '.';
1614             found = found || (thisbit != expected);
1615         }
1616         if (found) {
1617             linebuf[curb] = '\0';
1618             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1619         }
1620     }
1621 }
1622
1623 /* **** functions for postcopy ***** */
1624
1625 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1626 {
1627     struct RAMBlock *block;
1628     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1629
1630     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1631         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1632         unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1633         unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1634
1635         while (run_start < range) {
1636             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1637             ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1638                               (run_end - run_start) << TARGET_PAGE_BITS);
1639             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1640         }
1641     }
1642 }
1643
1644 /**
1645  * postcopy_send_discard_bm_ram: discard a RAMBlock
1646  *
1647  * Returns zero on success
1648  *
1649  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1650  * Note: At this point the 'unsentmap' is the processed bitmap combined
1651  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1652  *
1653  * @ms: current migration state
1654  * @pds: state for postcopy
1655  * @start: RAMBlock starting page
1656  * @length: RAMBlock size
1657  */
1658 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1659                                         PostcopyDiscardState *pds,
1660                                         unsigned long start,
1661                                         unsigned long length)
1662 {
1663     unsigned long end = start + length; /* one after the end */
1664     unsigned long current;
1665     unsigned long *unsentmap;
1666
1667     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1668     for (current = start; current < end; ) {
1669         unsigned long one = find_next_bit(unsentmap, end, current);
1670
1671         if (one <= end) {
1672             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1673             unsigned long discard_length;
1674
1675             if (zero >= end) {
1676                 discard_length = end - one;
1677             } else {
1678                 discard_length = zero - one;
1679             }
1680             if (discard_length) {
1681                 postcopy_discard_send_range(ms, pds, one, discard_length);
1682             }
1683             current = one + discard_length;
1684         } else {
1685             current = one;
1686         }
1687     }
1688
1689     return 0;
1690 }
1691
1692 /**
1693  * postcopy_each_ram_send_discard: discard all RAMBlocks
1694  *
1695  * Returns 0 for success or negative for error
1696  *
1697  * Utility for the outgoing postcopy code.
1698  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1699  *   passing it bitmap indexes and name.
1700  * (qemu_ram_foreach_block ends up passing unscaled lengths
1701  *  which would mean postcopy code would have to deal with target page)
1702  *
1703  * @ms: current migration state
1704  */
1705 static int postcopy_each_ram_send_discard(MigrationState *ms)
1706 {
1707     struct RAMBlock *block;
1708     int ret;
1709
1710     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1711         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1712         PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1713                                                                first,
1714                                                                block->idstr);
1715
1716         /*
1717          * Postcopy sends chunks of bitmap over the wire, but it
1718          * just needs indexes at this point, avoids it having
1719          * target page specific code.
1720          */
1721         ret = postcopy_send_discard_bm_ram(ms, pds, first,
1722                                     block->used_length >> TARGET_PAGE_BITS);
1723         postcopy_discard_send_finish(ms, pds);
1724         if (ret) {
1725             return ret;
1726         }
1727     }
1728
1729     return 0;
1730 }
1731
1732 /**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1734  *
1735  * Helper for postcopy_chunk_hostpages; it's called twice to
1736  * canonicalize the two bitmaps, that are similar, but one is
1737  * inverted.
1738  *
1739  * Postcopy requires that all target pages in a hostpage are dirty or
1740  * clean, not a mix.  This function canonicalizes the bitmaps.
1741  *
1742  * @ms: current migration state
1743  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1744  *               otherwise we need to canonicalize partially dirty host pages
1745  * @block: block that contains the page we want to canonicalize
1746  * @pds: state for postcopy
1747  */
1748 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1749                                           RAMBlock *block,
1750                                           PostcopyDiscardState *pds)
1751 {
1752     unsigned long *bitmap;
1753     unsigned long *unsentmap;
1754     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1755     unsigned long first = block->offset >> TARGET_PAGE_BITS;
1756     unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1757     unsigned long last = first + (len - 1);
1758     unsigned long run_start;
1759
1760     if (block->page_size == TARGET_PAGE_SIZE) {
1761         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1762         return;
1763     }
1764
1765     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1766     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1767
1768     if (unsent_pass) {
1769         /* Find a sent page */
1770         run_start = find_next_zero_bit(unsentmap, last + 1, first);
1771     } else {
1772         /* Find a dirty page */
1773         run_start = find_next_bit(bitmap, last + 1, first);
1774     }
1775
1776     while (run_start <= last) {
1777         bool do_fixup = false;
1778         unsigned long fixup_start_addr;
1779         unsigned long host_offset;
1780
1781         /*
1782          * If the start of this run of pages is in the middle of a host
1783          * page, then we need to fixup this host page.
1784          */
1785         host_offset = run_start % host_ratio;
1786         if (host_offset) {
1787             do_fixup = true;
1788             run_start -= host_offset;
1789             fixup_start_addr = run_start;
1790             /* For the next pass */
1791             run_start = run_start + host_ratio;
1792         } else {
1793             /* Find the end of this run */
1794             unsigned long run_end;
1795             if (unsent_pass) {
1796                 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1797             } else {
1798                 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1799             }
1800             /*
1801              * If the end isn't at the start of a host page, then the
1802              * run doesn't finish at the end of a host page
1803              * and we need to discard.
1804              */
1805             host_offset = run_end % host_ratio;
1806             if (host_offset) {
1807                 do_fixup = true;
1808                 fixup_start_addr = run_end - host_offset;
1809                 /*
1810                  * This host page has gone, the next loop iteration starts
1811                  * from after the fixup
1812                  */
1813                 run_start = fixup_start_addr + host_ratio;
1814             } else {
1815                 /*
1816                  * No discards on this iteration, next loop starts from
1817                  * next sent/dirty page
1818                  */
1819                 run_start = run_end + 1;
1820             }
1821         }
1822
1823         if (do_fixup) {
1824             unsigned long page;
1825
1826             /* Tell the destination to discard this page */
1827             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1828                 /* For the unsent_pass we:
1829                  *     discard partially sent pages
1830                  * For the !unsent_pass (dirty) we:
1831                  *     discard partially dirty pages that were sent
1832                  *     (any partially sent pages were already discarded
1833                  *     by the previous unsent_pass)
1834                  */
1835                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1836                                             host_ratio);
1837             }
1838
1839             /* Clean up the bitmap */
1840             for (page = fixup_start_addr;
1841                  page < fixup_start_addr + host_ratio; page++) {
1842                 /* All pages in this host page are now not sent */
1843                 set_bit(page, unsentmap);
1844
1845                 /*
1846                  * Remark them as dirty, updating the count for any pages
1847                  * that weren't previously dirty.
1848                  */
1849                 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1850             }
1851         }
1852
1853         if (unsent_pass) {
1854             /* Find the next sent page for the next iteration */
1855             run_start = find_next_zero_bit(unsentmap, last + 1,
1856                                            run_start);
1857         } else {
1858             /* Find the next dirty page for the next iteration */
1859             run_start = find_next_bit(bitmap, last + 1, run_start);
1860         }
1861     }
1862 }
1863
1864 /**
1865  * postcopy_chuck_hostpages: discrad any partially sent host page
1866  *
1867  * Utility for the outgoing postcopy code.
1868  *
1869  * Discard any partially sent host-page size chunks, mark any partially
1870  * dirty host-page size chunks as all dirty.  In this case the host-page
1871  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1872  *
1873  * Returns zero on success
1874  *
1875  * @ms: current migration state
1876  */
1877 static int postcopy_chunk_hostpages(MigrationState *ms)
1878 {
1879     RAMState *rs = &ram_state;
1880     struct RAMBlock *block;
1881
1882     /* Easiest way to make sure we don't resume in the middle of a host-page */
1883     rs->last_seen_block = NULL;
1884     rs->last_sent_block = NULL;
1885     rs->last_offset     = 0;
1886
1887     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1888         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1889
1890         PostcopyDiscardState *pds =
1891                          postcopy_discard_send_init(ms, first, block->idstr);
1892
1893         /* First pass: Discard all partially sent host pages */
1894         postcopy_chunk_hostpages_pass(ms, true, block, pds);
1895         /*
1896          * Second pass: Ensure that all partially dirty host pages are made
1897          * fully dirty.
1898          */
1899         postcopy_chunk_hostpages_pass(ms, false, block, pds);
1900
1901         postcopy_discard_send_finish(ms, pds);
1902     } /* ram_list loop */
1903
1904     return 0;
1905 }
1906
1907 /**
1908  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1909  *
1910  * Returns zero on success
1911  *
1912  * Transmit the set of pages to be discarded after precopy to the target
1913  * these are pages that:
1914  *     a) Have been previously transmitted but are now dirty again
1915  *     b) Pages that have never been transmitted, this ensures that
1916  *        any pages on the destination that have been mapped by background
1917  *        tasks get discarded (transparent huge pages is the specific concern)
1918  * Hopefully this is pretty sparse
1919  *
1920  * @ms: current migration state
1921  */
1922 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1923 {
1924     int ret;
1925     unsigned long *bitmap, *unsentmap;
1926
1927     rcu_read_lock();
1928
1929     /* This should be our last sync, the src is now paused */
1930     migration_bitmap_sync(&ram_state);
1931
1932     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1933     if (!unsentmap) {
1934         /* We don't have a safe way to resize the sentmap, so
1935          * if the bitmap was resized it will be NULL at this
1936          * point.
1937          */
1938         error_report("migration ram resized during precopy phase");
1939         rcu_read_unlock();
1940         return -EINVAL;
1941     }
1942
1943     /* Deal with TPS != HPS and huge pages */
1944     ret = postcopy_chunk_hostpages(ms);
1945     if (ret) {
1946         rcu_read_unlock();
1947         return ret;
1948     }
1949
1950     /*
1951      * Update the unsentmap to be unsentmap = unsentmap | dirty
1952      */
1953     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1954     bitmap_or(unsentmap, unsentmap, bitmap,
1955                last_ram_offset() >> TARGET_PAGE_BITS);
1956
1957
1958     trace_ram_postcopy_send_discard_bitmap();
1959 #ifdef DEBUG_POSTCOPY
1960     ram_debug_dump_bitmap(unsentmap, true);
1961 #endif
1962
1963     ret = postcopy_each_ram_send_discard(ms);
1964     rcu_read_unlock();
1965
1966     return ret;
1967 }
1968
1969 /**
1970  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1971  *
1972  * Returns zero on success
1973  *
1974  * @mis: current migration incoming state
1975  * @rbname: name of the RAMBlock of the request. NULL means the
1976  *          same that last one.
1977  * @start: RAMBlock starting page
1978  * @length: RAMBlock size
1979  */
1980 int ram_discard_range(MigrationIncomingState *mis,
1981                       const char *rbname,
1982                       uint64_t start, size_t length)
1983 {
1984     int ret = -1;
1985
1986     trace_ram_discard_range(rbname, start, length);
1987
1988     rcu_read_lock();
1989     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1990
1991     if (!rb) {
1992         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1993         goto err;
1994     }
1995
1996     ret = ram_block_discard_range(rb, start, length);
1997
1998 err:
1999     rcu_read_unlock();
2000
2001     return ret;
2002 }
2003
2004 static int ram_save_init_globals(RAMState *rs)
2005 {
2006     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
2007
2008     rs->dirty_rate_high_cnt = 0;
2009     rs->bitmap_sync_count = 0;
2010     migration_bitmap_sync_init(rs);
2011     qemu_mutex_init(&migration_bitmap_mutex);
2012
2013     if (migrate_use_xbzrle()) {
2014         XBZRLE_cache_lock();
2015         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
2016         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2017                                   TARGET_PAGE_SIZE,
2018                                   TARGET_PAGE_SIZE);
2019         if (!XBZRLE.cache) {
2020             XBZRLE_cache_unlock();
2021             error_report("Error creating cache");
2022             return -1;
2023         }
2024         XBZRLE_cache_unlock();
2025
2026         /* We prefer not to abort if there is no memory */
2027         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2028         if (!XBZRLE.encoded_buf) {
2029             error_report("Error allocating encoded_buf");
2030             return -1;
2031         }
2032
2033         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2034         if (!XBZRLE.current_buf) {
2035             error_report("Error allocating current_buf");
2036             g_free(XBZRLE.encoded_buf);
2037             XBZRLE.encoded_buf = NULL;
2038             return -1;
2039         }
2040
2041         acct_clear();
2042     }
2043
2044     /* For memory_global_dirty_log_start below.  */
2045     qemu_mutex_lock_iothread();
2046
2047     qemu_mutex_lock_ramlist();
2048     rcu_read_lock();
2049     bytes_transferred = 0;
2050     ram_state_reset(rs);
2051
2052     migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
2053     /* Skip setting bitmap if there is no RAM */
2054     if (ram_bytes_total()) {
2055         ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2056         migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
2057         bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
2058
2059         if (migrate_postcopy_ram()) {
2060             migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
2061             bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
2062         }
2063     }
2064
2065     /*
2066      * Count the total number of pages used by ram blocks not including any
2067      * gaps due to alignment or unplugs.
2068      */
2069     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2070
2071     memory_global_dirty_log_start();
2072     migration_bitmap_sync(rs);
2073     qemu_mutex_unlock_ramlist();
2074     qemu_mutex_unlock_iothread();
2075     rcu_read_unlock();
2076
2077     return 0;
2078 }
2079
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
2086
2087 /**
2088  * ram_save_setup: Setup RAM for migration
2089  *
2090  * Returns zero to indicate success and negative for error
2091  *
2092  * @f: QEMUFile where to send the data
2093  * @opaque: RAMState pointer
2094  */
2095 static int ram_save_setup(QEMUFile *f, void *opaque)
2096 {
2097     RAMState *rs = opaque;
2098     RAMBlock *block;
2099
2100     /* migration has already setup the bitmap, reuse it. */
2101     if (!migration_in_colo_state()) {
2102         if (ram_save_init_globals(rs) < 0) {
2103             return -1;
2104          }
2105     }
2106
2107     rcu_read_lock();
2108
2109     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2110
2111     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2112         qemu_put_byte(f, strlen(block->idstr));
2113         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2114         qemu_put_be64(f, block->used_length);
2115         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2116             qemu_put_be64(f, block->page_size);
2117         }
2118     }
2119
2120     rcu_read_unlock();
2121
2122     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2123     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2124
2125     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2126
2127     return 0;
2128 }
2129
2130 /**
2131  * ram_save_iterate: iterative stage for migration
2132  *
2133  * Returns zero to indicate success and negative for error
2134  *
2135  * @f: QEMUFile where to send the data
2136  * @opaque: RAMState pointer
2137  */
2138 static int ram_save_iterate(QEMUFile *f, void *opaque)
2139 {
2140     RAMState *rs = opaque;
2141     int ret;
2142     int i;
2143     int64_t t0;
2144     int done = 0;
2145
2146     rcu_read_lock();
2147     if (ram_list.version != rs->last_version) {
2148         ram_state_reset(rs);
2149     }
2150
2151     /* Read version before ram_list.blocks */
2152     smp_rmb();
2153
2154     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2155
2156     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2157     i = 0;
2158     while ((ret = qemu_file_rate_limit(f)) == 0) {
2159         int pages;
2160
2161         pages = ram_find_and_save_block(rs, f, false, &bytes_transferred);
2162         /* no more pages to sent */
2163         if (pages == 0) {
2164             done = 1;
2165             break;
2166         }
2167         acct_info.iterations++;
2168
2169         /* we want to check in the 1st loop, just in case it was the 1st time
2170            and we had to sync the dirty bitmap.
2171            qemu_get_clock_ns() is a bit expensive, so we only check each some
2172            iterations
2173         */
2174         if ((i & 63) == 0) {
2175             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2176             if (t1 > MAX_WAIT) {
2177                 trace_ram_save_iterate_big_wait(t1, i);
2178                 break;
2179             }
2180         }
2181         i++;
2182     }
2183     flush_compressed_data(f);
2184     rcu_read_unlock();
2185
2186     /*
2187      * Must occur before EOS (or any QEMUFile operation)
2188      * because of RDMA protocol.
2189      */
2190     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2191
2192     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2193     bytes_transferred += 8;
2194
2195     ret = qemu_file_get_error(f);
2196     if (ret < 0) {
2197         return ret;
2198     }
2199
2200     return done;
2201 }
2202
2203 /**
2204  * ram_save_complete: function called to send the remaining amount of ram
2205  *
2206  * Returns zero to indicate success
2207  *
2208  * Called with iothread lock
2209  *
2210  * @f: QEMUFile where to send the data
2211  * @opaque: RAMState pointer
2212  */
2213 static int ram_save_complete(QEMUFile *f, void *opaque)
2214 {
2215     RAMState *rs = opaque;
2216
2217     rcu_read_lock();
2218
2219     if (!migration_in_postcopy(migrate_get_current())) {
2220         migration_bitmap_sync(rs);
2221     }
2222
2223     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2224
2225     /* try transferring iterative blocks of memory */
2226
2227     /* flush all remaining blocks regardless of rate limiting */
2228     while (true) {
2229         int pages;
2230
2231         pages = ram_find_and_save_block(rs, f, !migration_in_colo_state(),
2232                                         &bytes_transferred);
2233         /* no more blocks to sent */
2234         if (pages == 0) {
2235             break;
2236         }
2237     }
2238
2239     flush_compressed_data(f);
2240     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2241
2242     rcu_read_unlock();
2243
2244     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2245
2246     return 0;
2247 }
2248
2249 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2250                              uint64_t *non_postcopiable_pending,
2251                              uint64_t *postcopiable_pending)
2252 {
2253     RAMState *rs = opaque;
2254     uint64_t remaining_size;
2255
2256     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2257
2258     if (!migration_in_postcopy(migrate_get_current()) &&
2259         remaining_size < max_size) {
2260         qemu_mutex_lock_iothread();
2261         rcu_read_lock();
2262         migration_bitmap_sync(rs);
2263         rcu_read_unlock();
2264         qemu_mutex_unlock_iothread();
2265         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2266     }
2267
2268     /* We can do postcopy, and all the data is postcopiable */
2269     *postcopiable_pending += remaining_size;
2270 }
2271
2272 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2273 {
2274     unsigned int xh_len;
2275     int xh_flags;
2276     uint8_t *loaded_data;
2277
2278     if (!xbzrle_decoded_buf) {
2279         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2280     }
2281     loaded_data = xbzrle_decoded_buf;
2282
2283     /* extract RLE header */
2284     xh_flags = qemu_get_byte(f);
2285     xh_len = qemu_get_be16(f);
2286
2287     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2288         error_report("Failed to load XBZRLE page - wrong compression!");
2289         return -1;
2290     }
2291
2292     if (xh_len > TARGET_PAGE_SIZE) {
2293         error_report("Failed to load XBZRLE page - len overflow!");
2294         return -1;
2295     }
2296     /* load data and decode */
2297     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2298
2299     /* decode RLE */
2300     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2301                              TARGET_PAGE_SIZE) == -1) {
2302         error_report("Failed to load XBZRLE page - decode error!");
2303         return -1;
2304     }
2305
2306     return 0;
2307 }
2308
2309 /**
2310  * ram_block_from_stream: read a RAMBlock id from the migration stream
2311  *
2312  * Must be called from within a rcu critical section.
2313  *
2314  * Returns a pointer from within the RCU-protected ram_list.
2315  *
2316  * @f: QEMUFile where to read the data from
2317  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2318  */
2319 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2320 {
2321     static RAMBlock *block = NULL;
2322     char id[256];
2323     uint8_t len;
2324
2325     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2326         if (!block) {
2327             error_report("Ack, bad migration stream!");
2328             return NULL;
2329         }
2330         return block;
2331     }
2332
2333     len = qemu_get_byte(f);
2334     qemu_get_buffer(f, (uint8_t *)id, len);
2335     id[len] = 0;
2336
2337     block = qemu_ram_block_by_name(id);
2338     if (!block) {
2339         error_report("Can't find block %s", id);
2340         return NULL;
2341     }
2342
2343     return block;
2344 }
2345
2346 static inline void *host_from_ram_block_offset(RAMBlock *block,
2347                                                ram_addr_t offset)
2348 {
2349     if (!offset_in_ramblock(block, offset)) {
2350         return NULL;
2351     }
2352
2353     return block->host + offset;
2354 }
2355
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been determined to be filled
 * with a single byte value, write that value.  The write is skipped when
 * the fill byte is zero and the range already reads as zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Already all zero: avoid touching the memory */
        return;
    }

    memset(host, ch, size);
}
2372
2373 static void *do_data_decompress(void *opaque)
2374 {
2375     DecompressParam *param = opaque;
2376     unsigned long pagesize;
2377     uint8_t *des;
2378     int len;
2379
2380     qemu_mutex_lock(&param->mutex);
2381     while (!param->quit) {
2382         if (param->des) {
2383             des = param->des;
2384             len = param->len;
2385             param->des = 0;
2386             qemu_mutex_unlock(&param->mutex);
2387
2388             pagesize = TARGET_PAGE_SIZE;
2389             /* uncompress() will return failed in some case, especially
2390              * when the page is dirted when doing the compression, it's
2391              * not a problem because the dirty page will be retransferred
2392              * and uncompress() won't break the data in other pages.
2393              */
2394             uncompress((Bytef *)des, &pagesize,
2395                        (const Bytef *)param->compbuf, len);
2396
2397             qemu_mutex_lock(&decomp_done_lock);
2398             param->done = true;
2399             qemu_cond_signal(&decomp_done_cond);
2400             qemu_mutex_unlock(&decomp_done_lock);
2401
2402             qemu_mutex_lock(&param->mutex);
2403         } else {
2404             qemu_cond_wait(&param->cond, &param->mutex);
2405         }
2406     }
2407     qemu_mutex_unlock(&param->mutex);
2408
2409     return NULL;
2410 }
2411
2412 static void wait_for_decompress_done(void)
2413 {
2414     int idx, thread_count;
2415
2416     if (!migrate_use_compression()) {
2417         return;
2418     }
2419
2420     thread_count = migrate_decompress_threads();
2421     qemu_mutex_lock(&decomp_done_lock);
2422     for (idx = 0; idx < thread_count; idx++) {
2423         while (!decomp_param[idx].done) {
2424             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2425         }
2426     }
2427     qemu_mutex_unlock(&decomp_done_lock);
2428 }
2429
2430 void migrate_decompress_threads_create(void)
2431 {
2432     int i, thread_count;
2433
2434     thread_count = migrate_decompress_threads();
2435     decompress_threads = g_new0(QemuThread, thread_count);
2436     decomp_param = g_new0(DecompressParam, thread_count);
2437     qemu_mutex_init(&decomp_done_lock);
2438     qemu_cond_init(&decomp_done_cond);
2439     for (i = 0; i < thread_count; i++) {
2440         qemu_mutex_init(&decomp_param[i].mutex);
2441         qemu_cond_init(&decomp_param[i].cond);
2442         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2443         decomp_param[i].done = true;
2444         decomp_param[i].quit = false;
2445         qemu_thread_create(decompress_threads + i, "decompress",
2446                            do_data_decompress, decomp_param + i,
2447                            QEMU_THREAD_JOINABLE);
2448     }
2449 }
2450
2451 void migrate_decompress_threads_join(void)
2452 {
2453     int i, thread_count;
2454
2455     thread_count = migrate_decompress_threads();
2456     for (i = 0; i < thread_count; i++) {
2457         qemu_mutex_lock(&decomp_param[i].mutex);
2458         decomp_param[i].quit = true;
2459         qemu_cond_signal(&decomp_param[i].cond);
2460         qemu_mutex_unlock(&decomp_param[i].mutex);
2461     }
2462     for (i = 0; i < thread_count; i++) {
2463         qemu_thread_join(decompress_threads + i);
2464         qemu_mutex_destroy(&decomp_param[i].mutex);
2465         qemu_cond_destroy(&decomp_param[i].cond);
2466         g_free(decomp_param[i].compbuf);
2467     }
2468     g_free(decompress_threads);
2469     g_free(decomp_param);
2470     decompress_threads = NULL;
2471     decomp_param = NULL;
2472 }
2473
2474 static void decompress_data_with_multi_threads(QEMUFile *f,
2475                                                void *host, int len)
2476 {
2477     int idx, thread_count;
2478
2479     thread_count = migrate_decompress_threads();
2480     qemu_mutex_lock(&decomp_done_lock);
2481     while (true) {
2482         for (idx = 0; idx < thread_count; idx++) {
2483             if (decomp_param[idx].done) {
2484                 decomp_param[idx].done = false;
2485                 qemu_mutex_lock(&decomp_param[idx].mutex);
2486                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2487                 decomp_param[idx].des = host;
2488                 decomp_param[idx].len = len;
2489                 qemu_cond_signal(&decomp_param[idx].cond);
2490                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2491                 break;
2492             }
2493         }
2494         if (idx < thread_count) {
2495             break;
2496         } else {
2497             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2498         }
2499     }
2500     qemu_mutex_unlock(&decomp_done_lock);
2501 }
2502
2503 /**
2504  * ram_postcopy_incoming_init: allocate postcopy data structures
2505  *
2506  * Returns 0 for success and negative if there was one error
2507  *
2508  * @mis: current migration incoming state
2509  *
2510  * Allocate data structures etc needed by incoming migration with
2511  * postcopy-ram. postcopy-ram's similarly names
2512  * postcopy_ram_incoming_init does the work.
2513  */
2514 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2515 {
2516     size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2517
2518     return postcopy_ram_incoming_init(mis, ram_pages);
2519 }
2520
2521 /**
2522  * ram_load_postcopy: load a page in postcopy case
2523  *
2524  * Returns 0 for success or -errno in case of error
2525  *
2526  * Called in postcopy mode by ram_load().
2527  * rcu_read_lock is taken prior to this being called.
2528  *
2529  * @f: QEMUFile where to send the data
2530  */
2531 static int ram_load_postcopy(QEMUFile *f)
2532 {
2533     int flags = 0, ret = 0;
2534     bool place_needed = false;
2535     bool matching_page_sizes = false;
2536     MigrationIncomingState *mis = migration_incoming_get_current();
2537     /* Temporary page that is later 'placed' */
2538     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2539     void *last_host = NULL;
2540     bool all_zero = false;
2541
2542     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2543         ram_addr_t addr;
2544         void *host = NULL;
2545         void *page_buffer = NULL;
2546         void *place_source = NULL;
2547         RAMBlock *block = NULL;
2548         uint8_t ch;
2549
2550         addr = qemu_get_be64(f);
2551         flags = addr & ~TARGET_PAGE_MASK;
2552         addr &= TARGET_PAGE_MASK;
2553
2554         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2555         place_needed = false;
2556         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2557             block = ram_block_from_stream(f, flags);
2558
2559             host = host_from_ram_block_offset(block, addr);
2560             if (!host) {
2561                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2562                 ret = -EINVAL;
2563                 break;
2564             }
2565             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2566             /*
2567              * Postcopy requires that we place whole host pages atomically;
2568              * these may be huge pages for RAMBlocks that are backed by
2569              * hugetlbfs.
2570              * To make it atomic, the data is read into a temporary page
2571              * that's moved into place later.
2572              * The migration protocol uses,  possibly smaller, target-pages
2573              * however the source ensures it always sends all the components
2574              * of a host page in order.
2575              */
2576             page_buffer = postcopy_host_page +
2577                           ((uintptr_t)host & (block->page_size - 1));
2578             /* If all TP are zero then we can optimise the place */
2579             if (!((uintptr_t)host & (block->page_size - 1))) {
2580                 all_zero = true;
2581             } else {
2582                 /* not the 1st TP within the HP */
2583                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2584                     error_report("Non-sequential target page %p/%p",
2585                                   host, last_host);
2586                     ret = -EINVAL;
2587                     break;
2588                 }
2589             }
2590
2591
2592             /*
2593              * If it's the last part of a host page then we place the host
2594              * page
2595              */
2596             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2597                                      (block->page_size - 1)) == 0;
2598             place_source = postcopy_host_page;
2599         }
2600         last_host = host;
2601
2602         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2603         case RAM_SAVE_FLAG_COMPRESS:
2604             ch = qemu_get_byte(f);
2605             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2606             if (ch) {
2607                 all_zero = false;
2608             }
2609             break;
2610
2611         case RAM_SAVE_FLAG_PAGE:
2612             all_zero = false;
2613             if (!place_needed || !matching_page_sizes) {
2614                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2615             } else {
2616                 /* Avoids the qemu_file copy during postcopy, which is
2617                  * going to do a copy later; can only do it when we
2618                  * do this read in one go (matching page sizes)
2619                  */
2620                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2621                                          TARGET_PAGE_SIZE);
2622             }
2623             break;
2624         case RAM_SAVE_FLAG_EOS:
2625             /* normal exit */
2626             break;
2627         default:
2628             error_report("Unknown combination of migration flags: %#x"
2629                          " (postcopy mode)", flags);
2630             ret = -EINVAL;
2631         }
2632
2633         if (place_needed) {
2634             /* This gets called at the last target page in the host page */
2635             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2636
2637             if (all_zero) {
2638                 ret = postcopy_place_page_zero(mis, place_dest,
2639                                                block->page_size);
2640             } else {
2641                 ret = postcopy_place_page(mis, place_dest,
2642                                           place_source, block->page_size);
2643             }
2644         }
2645         if (!ret) {
2646             ret = qemu_file_get_error(f);
2647         }
2648     }
2649
2650     return ret;
2651 }
2652
2653 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2654 {
2655     int flags = 0, ret = 0;
2656     static uint64_t seq_iter;
2657     int len = 0;
2658     /*
2659      * If system is running in postcopy mode, page inserts to host memory must
2660      * be atomic
2661      */
2662     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2663     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2664     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2665
2666     seq_iter++;
2667
2668     if (version_id != 4) {
2669         ret = -EINVAL;
2670     }
2671
2672     /* This RCU critical section can be very long running.
2673      * When RCU reclaims in the code start to become numerous,
2674      * it will be necessary to reduce the granularity of this
2675      * critical section.
2676      */
2677     rcu_read_lock();
2678
2679     if (postcopy_running) {
2680         ret = ram_load_postcopy(f);
2681     }
2682
2683     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2684         ram_addr_t addr, total_ram_bytes;
2685         void *host = NULL;
2686         uint8_t ch;
2687
2688         addr = qemu_get_be64(f);
2689         flags = addr & ~TARGET_PAGE_MASK;
2690         addr &= TARGET_PAGE_MASK;
2691
2692         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2693                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2694             RAMBlock *block = ram_block_from_stream(f, flags);
2695
2696             host = host_from_ram_block_offset(block, addr);
2697             if (!host) {
2698                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2699                 ret = -EINVAL;
2700                 break;
2701             }
2702         }
2703
2704         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2705         case RAM_SAVE_FLAG_MEM_SIZE:
2706             /* Synchronize RAM block list */
2707             total_ram_bytes = addr;
2708             while (!ret && total_ram_bytes) {
2709                 RAMBlock *block;
2710                 char id[256];
2711                 ram_addr_t length;
2712
2713                 len = qemu_get_byte(f);
2714                 qemu_get_buffer(f, (uint8_t *)id, len);
2715                 id[len] = 0;
2716                 length = qemu_get_be64(f);
2717
2718                 block = qemu_ram_block_by_name(id);
2719                 if (block) {
2720                     if (length != block->used_length) {
2721                         Error *local_err = NULL;
2722
2723                         ret = qemu_ram_resize(block, length,
2724                                               &local_err);
2725                         if (local_err) {
2726                             error_report_err(local_err);
2727                         }
2728                     }
2729                     /* For postcopy we need to check hugepage sizes match */
2730                     if (postcopy_advised &&
2731                         block->page_size != qemu_host_page_size) {
2732                         uint64_t remote_page_size = qemu_get_be64(f);
2733                         if (remote_page_size != block->page_size) {
2734                             error_report("Mismatched RAM page size %s "
2735                                          "(local) %zd != %" PRId64,
2736                                          id, block->page_size,
2737                                          remote_page_size);
2738                             ret = -EINVAL;
2739                         }
2740                     }
2741                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2742                                           block->idstr);
2743                 } else {
2744                     error_report("Unknown ramblock \"%s\", cannot "
2745                                  "accept migration", id);
2746                     ret = -EINVAL;
2747                 }
2748
2749                 total_ram_bytes -= length;
2750             }
2751             break;
2752
2753         case RAM_SAVE_FLAG_COMPRESS:
2754             ch = qemu_get_byte(f);
2755             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2756             break;
2757
2758         case RAM_SAVE_FLAG_PAGE:
2759             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2760             break;
2761
2762         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2763             len = qemu_get_be32(f);
2764             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2765                 error_report("Invalid compressed data length: %d", len);
2766                 ret = -EINVAL;
2767                 break;
2768             }
2769             decompress_data_with_multi_threads(f, host, len);
2770             break;
2771
2772         case RAM_SAVE_FLAG_XBZRLE:
2773             if (load_xbzrle(f, addr, host) < 0) {
2774                 error_report("Failed to decompress XBZRLE page at "
2775                              RAM_ADDR_FMT, addr);
2776                 ret = -EINVAL;
2777                 break;
2778             }
2779             break;
2780         case RAM_SAVE_FLAG_EOS:
2781             /* normal exit */
2782             break;
2783         default:
2784             if (flags & RAM_SAVE_FLAG_HOOK) {
2785                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2786             } else {
2787                 error_report("Unknown combination of migration flags: %#x",
2788                              flags);
2789                 ret = -EINVAL;
2790             }
2791         }
2792         if (!ret) {
2793             ret = qemu_file_get_error(f);
2794         }
2795     }
2796
2797     wait_for_decompress_done();
2798     rcu_read_unlock();
2799     trace_ram_load_complete(ret, seq_iter);
2800     return ret;
2801 }
2802
2803 static SaveVMHandlers savevm_ram_handlers = {
2804     .save_live_setup = ram_save_setup,
2805     .save_live_iterate = ram_save_iterate,
2806     .save_live_complete_postcopy = ram_save_complete,
2807     .save_live_complete_precopy = ram_save_complete,
2808     .save_live_pending = ram_save_pending,
2809     .load_state = ram_load,
2810     .cleanup = ram_migration_cleanup,
2811 };
2812
2813 void ram_mig_init(void)
2814 {
2815     qemu_mutex_init(&XBZRLE.lock);
2816     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2817 }