4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
48 /***********************************************************/
49 /* ram save/restore */
51 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
52 #define RAM_SAVE_FLAG_COMPRESS 0x02
53 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
54 #define RAM_SAVE_FLAG_PAGE 0x08
55 #define RAM_SAVE_FLAG_EOS 0x10
56 #define RAM_SAVE_FLAG_CONTINUE 0x20
57 #define RAM_SAVE_FLAG_XBZRLE 0x40
58 /* 0x80 is reserved in migration.h; start with 0x100 next */
59 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
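/* These flags travel in the low bits of the page offset written by
 * save_page_header(); offsets are target-page aligned, so the low bits
 * are free.  For example (a sketch assuming 4 KiB target pages, so bits
 * 0-11 are available), a normal page at block offset 0x3000 that
 * continues the previous block would be announced as:
 *
 *     0x3000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE  ==  0x3028
 */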
61 static uint8_t *ZERO_TARGET_PAGE;
63 static inline bool is_zero_range(uint8_t *p, uint64_t size)
65 return buffer_is_zero(p, size);
68 /* struct contains XBZRLE cache and a static page
69 used by the compression */
71 /* buffer used for XBZRLE encoding */
73 /* buffer for storing page content */
75 /* Cache for XBZRLE, Protected by lock. */
80 /* buffer used for XBZRLE decoding */
81 static uint8_t *xbzrle_decoded_buf;
83 static void XBZRLE_cache_lock(void)
85 if (migrate_use_xbzrle())
86 qemu_mutex_lock(&XBZRLE.lock);
89 static void XBZRLE_cache_unlock(void)
91 if (migrate_use_xbzrle())
92 qemu_mutex_unlock(&XBZRLE.lock);
96 * xbzrle_cache_resize: resize the xbzrle cache
98 * This function is called from qmp_migrate_set_cache_size in main
99 * thread, possibly while a migration is in progress. A running
100 * migration may be using the cache and might finish during this call,
101 * hence changes to the cache are protected by XBZRLE.lock().
103 * Returns the new_size or negative in case of error.
105 * @new_size: new cache size
107 int64_t xbzrle_cache_resize(int64_t new_size)
109 PageCache *new_cache;
112 if (new_size < TARGET_PAGE_SIZE) {
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
125 error_report("Error creating cache");
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
135 ret = pow2floor(new_size);
137 XBZRLE_cache_unlock();
141 /* State of RAM for migration */
143 /* Last block that we have visited searching for dirty pages */
144 RAMBlock *last_seen_block;
145 /* Last block from where we have sent data */
146 RAMBlock *last_sent_block;
147 /* Last offset we have sent data from */
148 ram_addr_t last_offset;
149 /* last ram version we have seen */
150 uint32_t last_version;
151 /* We are in the first round */
153 /* How many times we have dirty too many pages */
154 int dirty_rate_high_cnt;
155 /* How many times we have synchronized the bitmap */
156 uint64_t bitmap_sync_count;
157 /* these variables are used for bitmap sync */
158 /* last time we did a full bitmap_sync */
159 int64_t time_last_bitmap_sync;
160 /* bytes transferred at start_time */
161 uint64_t bytes_xfer_prev;
162 /* number of dirty pages since start_time */
163 uint64_t num_dirty_pages_period;
164 /* xbzrle misses since the beginning of the period */
165 uint64_t xbzrle_cache_miss_prev;
166 /* number of iterations at the beginning of period */
167 uint64_t iterations_prev;
168 /* Accounting fields */
169 /* number of zero pages. It used to be pages filled by the same char. */
171 /* number of normal transferred pages */
173 /* Iterations since start */
175 /* xbzrle transmitted bytes. Notice that this is with
176 * compression, they can't be calculated from the pages */
177 uint64_t xbzrle_bytes;
178 /* xbzrle transmitted pages */
179 uint64_t xbzrle_pages;
180 /* xbzrle number of cache miss */
181 uint64_t xbzrle_cache_miss;
182 /* xbzrle miss rate */
183 double xbzrle_cache_miss_rate;
185 typedef struct RAMState RAMState;
187 static RAMState ram_state;
189 /* accounting for migration statistics */
190 typedef struct AccountingInfo {
191 uint64_t xbzrle_overflows;
194 static AccountingInfo acct_info;
196 static void acct_clear(void)
198 memset(&acct_info, 0, sizeof(acct_info));
201 uint64_t dup_mig_pages_transferred(void)
203 return ram_state.zero_pages;
206 uint64_t norm_mig_pages_transferred(void)
208 return ram_state.norm_pages;
211 uint64_t xbzrle_mig_bytes_transferred(void)
213 return ram_state.xbzrle_bytes;
216 uint64_t xbzrle_mig_pages_transferred(void)
218 return ram_state.xbzrle_pages;
221 uint64_t xbzrle_mig_pages_cache_miss(void)
223 return ram_state.xbzrle_cache_miss;
226 double xbzrle_mig_cache_miss_rate(void)
228 return ram_state.xbzrle_cache_miss_rate;
231 uint64_t xbzrle_mig_pages_overflow(void)
233 return acct_info.xbzrle_overflows;
236 static QemuMutex migration_bitmap_mutex;
237 static uint64_t migration_dirty_pages;
239 /* used by the search for pages to send */
240 struct PageSearchStatus {
241 /* Current block being searched */
243 /* Current offset to search from */
245 /* Set once we wrap around */
248 typedef struct PageSearchStatus PageSearchStatus;
250 static struct BitmapRcu {
252 /* Main migration bitmap */
254 /* bitmap of pages that haven't been sent even once
255 * only maintained and used in postcopy at the moment
256 * where it's used to send the dirtymap at the start
257 * of the postcopy phase
259 unsigned long *unsentmap;
260 } *migration_bitmap_rcu;
262 struct CompressParam {
271 typedef struct CompressParam CompressParam;
273 struct DecompressParam {
282 typedef struct DecompressParam DecompressParam;
284 static CompressParam *comp_param;
285 static QemuThread *compress_threads;
286 /* comp_done_cond is used to wake up the migration thread when
287 * one of the compression threads has finished the compression.
288 * comp_done_lock is used together with comp_done_cond.
290 static QemuMutex comp_done_lock;
291 static QemuCond comp_done_cond;
292 /* The empty QEMUFileOps will be used by file in CompressParam */
293 static const QEMUFileOps empty_ops = { };
295 static bool compression_switch;
296 static DecompressParam *decomp_param;
297 static QemuThread *decompress_threads;
298 static QemuMutex decomp_done_lock;
299 static QemuCond decomp_done_cond;
301 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
304 static void *do_data_compress(void *opaque)
306 CompressParam *param = opaque;
310 qemu_mutex_lock(&param->mutex);
311 while (!param->quit) {
313 block = param->block;
314 offset = param->offset;
316 qemu_mutex_unlock(&param->mutex);
318 do_compress_ram_page(param->file, block, offset);
320 qemu_mutex_lock(&comp_done_lock);
322 qemu_cond_signal(&comp_done_cond);
323 qemu_mutex_unlock(&comp_done_lock);
325 qemu_mutex_lock(&param->mutex);
327 qemu_cond_wait(&param->cond, &param->mutex);
330 qemu_mutex_unlock(&param->mutex);
335 static inline void terminate_compression_threads(void)
337 int idx, thread_count;
339 thread_count = migrate_compress_threads();
341 for (idx = 0; idx < thread_count; idx++) {
342 qemu_mutex_lock(&comp_param[idx].mutex);
343 comp_param[idx].quit = true;
344 qemu_cond_signal(&comp_param[idx].cond);
345 qemu_mutex_unlock(&comp_param[idx].mutex);
349 void migrate_compress_threads_join(void)
353 if (!migrate_use_compression()) {
356 terminate_compression_threads();
357 thread_count = migrate_compress_threads();
358 for (i = 0; i < thread_count; i++) {
359 qemu_thread_join(compress_threads + i);
360 qemu_fclose(comp_param[i].file);
361 qemu_mutex_destroy(&comp_param[i].mutex);
362 qemu_cond_destroy(&comp_param[i].cond);
364 qemu_mutex_destroy(&comp_done_lock);
365 qemu_cond_destroy(&comp_done_cond);
366 g_free(compress_threads);
368 compress_threads = NULL;
372 void migrate_compress_threads_create(void)
376 if (!migrate_use_compression()) {
379 compression_switch = true;
380 thread_count = migrate_compress_threads();
381 compress_threads = g_new0(QemuThread, thread_count);
382 comp_param = g_new0(CompressParam, thread_count);
383 qemu_cond_init(&comp_done_cond);
384 qemu_mutex_init(&comp_done_lock);
385 for (i = 0; i < thread_count; i++) {
386 /* comp_param[i].file is just used as a dummy buffer to save data,
387 * set its ops to empty.
389 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
390 comp_param[i].done = true;
391 comp_param[i].quit = false;
392 qemu_mutex_init(&comp_param[i].mutex);
393 qemu_cond_init(&comp_param[i].cond);
394 qemu_thread_create(compress_threads + i, "compress",
395 do_data_compress, comp_param + i,
396 QEMU_THREAD_JOINABLE);
401 * save_page_header: write page header to wire
403 * If this is the 1st block, it also writes the block identification
405 * Returns the number of bytes written
407 * @f: QEMUFile where to send the data
408 * @block: block that contains the page we want to send
409 * @offset: offset inside the block for the page
410 * in the lower bits, it contains flags
412 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
416 qemu_put_be64(f, offset);
419 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
420 len = strlen(block->idstr);
421 qemu_put_byte(f, len);
422 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
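    /* At this point the header on the wire is: an 8-byte big-endian word
     * holding offset|flags and, only when RAM_SAVE_FLAG_CONTINUE is not
     * set, a one-byte idstr length followed by the idstr bytes.  E.g. for
     * a block named "pc.ram" (an illustrative name) that is 8 + 1 + 6
     * bytes of header. */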
429 * mig_throttle_guest_down: throttle down the guest
431 * Reduce amount of guest cpu execution to hopefully slow down memory
432 * writes. If guest dirty memory rate is reduced below the rate at
433 * which we can transfer pages to the destination then we should be
434 * able to complete migration. Some workloads dirty memory way too
435 * fast and will not effectively converge, even with auto-converge.
437 static void mig_throttle_guest_down(void)
439 MigrationState *s = migrate_get_current();
440 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
441 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
443 /* We have not started throttling yet. Let's start it. */
444 if (!cpu_throttle_active()) {
445 cpu_throttle_set(pct_initial);
447 /* Throttling already on, just increase the rate */
448 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
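    /* As a worked example (assuming the default parameters of
     * cpu_throttle_initial=20 and cpu_throttle_increment=10), successive
     * calls throttle the guest to 20%, then 30%, 40%, ... until the
     * migration converges or completes. */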
453 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
455 * @rs: current RAM state
456 * @current_addr: address for the zero page
458 * Update the xbzrle cache to reflect a page that's been sent as all 0.
459 * The important thing is that a stale (not-yet-0'd) page be replaced by the new data.
461 * As a bonus, if the page wasn't in the cache it gets added so that
462 * when a small write is made into the 0'd page it gets XBZRLE sent.
464 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
466 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
470 /* We don't care if this fails to allocate a new cache page
471 * as long as it updated an old one */
472 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
473 rs->bitmap_sync_count);
476 #define ENCODING_FLAG_XBZRLE 0x1
479 * save_xbzrle_page: compress and send current page
481 * Returns: 1 means that we wrote the page
482 * 0 means that page is identical to the one already sent
483 * -1 means that xbzrle would be longer than normal
485 * @rs: current RAM state
486 * @f: QEMUFile where to send the data
487 * @current_data: pointer to the address of the page contents
488 * @current_addr: addr of the page
489 * @block: block that contains the page we want to send
490 * @offset: offset inside the block for the page
491 * @last_stage: if we are at the completion stage
492 * @bytes_transferred: increase it with the number of transferred bytes
494 static int save_xbzrle_page(RAMState *rs, QEMUFile *f, uint8_t **current_data,
495 ram_addr_t current_addr, RAMBlock *block,
496 ram_addr_t offset, bool last_stage,
497 uint64_t *bytes_transferred)
499 int encoded_len = 0, bytes_xbzrle;
500 uint8_t *prev_cached_page;
502 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
503 rs->xbzrle_cache_miss++;
505 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
506 rs->bitmap_sync_count) == -1) {
509 /* update *current_data when the page has been
510 inserted into cache */
511 *current_data = get_cached_data(XBZRLE.cache, current_addr);
517 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
519 /* save current buffer into memory */
520 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
522 /* XBZRLE encoding (if there is no overflow) */
523 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
524 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
526 if (encoded_len == 0) {
527 trace_save_xbzrle_page_skipping();
529 } else if (encoded_len == -1) {
530 trace_save_xbzrle_page_overflow();
531 acct_info.xbzrle_overflows++;
532 /* update data in the cache */
534 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
535 *current_data = prev_cached_page;
540 /* we need to update the data in the cache, in order to get the same data */
542 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
545 /* Send XBZRLE based compressed page */
546 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
547 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
548 qemu_put_be16(f, encoded_len);
549 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
550 bytes_xbzrle += encoded_len + 1 + 2;
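    /* The +1 accounts for the ENCODING_FLAG_XBZRLE byte and the +2 for the
     * big-endian 16-bit encoded_len written just above; save_page_header()
     * already contributed its own length via bytes_xbzrle's initial value. */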
552 rs->xbzrle_bytes += bytes_xbzrle;
553 *bytes_transferred += bytes_xbzrle;
559 * migration_bitmap_find_dirty: find the next dirty page from start
561 * Called with rcu_read_lock() to protect migration_bitmap
563 * Returns the byte offset within memory region of the start of a dirty page
565 * @rs: current RAM state
566 * @rb: RAMBlock where to search for dirty pages
567 * @start: starting address (typically so we can continue from previous page)
568 * @ram_addr_abs: pointer into which to store the address of the dirty page
569 * within the global ram_addr space
572 ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
574 ram_addr_t *ram_addr_abs)
576 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
577 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
578 uint64_t rb_size = rb->used_length;
579 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
580 unsigned long *bitmap;
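    /* base, nr and size index the global dirty bitmap in target-page
     * units: base is the block's first page, nr the first candidate bit
     * to test, and size one past the block's last page. */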
584 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
585 if (rs->ram_bulk_stage && nr > base) {
588 next = find_next_bit(bitmap, size, nr);
591 *ram_addr_abs = next << TARGET_PAGE_BITS;
592 return (next - base) << TARGET_PAGE_BITS;
595 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
598 int nr = addr >> TARGET_PAGE_BITS;
599 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
601 ret = test_and_clear_bit(nr, bitmap);
604 migration_dirty_pages--;
609 static void migration_bitmap_sync_range(RAMState *rs, ram_addr_t start,
612 unsigned long *bitmap;
613 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
614 migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap,
615 start, length, &rs->num_dirty_pages_period);
618 static void migration_bitmap_sync_init(RAMState *rs)
620 rs->time_last_bitmap_sync = 0;
621 rs->bytes_xfer_prev = 0;
622 rs->num_dirty_pages_period = 0;
623 rs->xbzrle_cache_miss_prev = 0;
624 rs->iterations_prev = 0;
628 * ram_pagesize_summary: calculate all the pagesizes of a VM
630 * Returns a summary bitmap of the page sizes of all RAMBlocks
632 * For VMs with just normal pages this is equivalent to the host page
633 * size. If it's got some huge pages then it's the OR of all the
634 * different page sizes.
636 uint64_t ram_pagesize_summary(void)
639 uint64_t summary = 0;
641 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
642 summary |= block->page_size;
648 static void migration_bitmap_sync(RAMState *rs)
651 MigrationState *s = migrate_get_current();
653 uint64_t bytes_xfer_now;
655 rs->bitmap_sync_count++;
657 if (!rs->bytes_xfer_prev) {
658 rs->bytes_xfer_prev = ram_bytes_transferred();
661 if (!rs->time_last_bitmap_sync) {
662 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
665 trace_migration_bitmap_sync_start();
666 memory_global_dirty_log_sync();
668 qemu_mutex_lock(&migration_bitmap_mutex);
670 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
671 migration_bitmap_sync_range(rs, block->offset, block->used_length);
674 qemu_mutex_unlock(&migration_bitmap_mutex);
676 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
678 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
680 /* more than 1 second = 1000 milliseconds */
681 if (end_time > rs->time_last_bitmap_sync + 1000) {
682 if (migrate_auto_converge()) {
683 /* The following detection logic can be refined later. For now:
684 Check to see if the dirtied bytes are 50% more than the approx.
685 amount of bytes that just got transferred since the last time we
686 were in this routine. If that happens twice, start or increase throttling. */
688 bytes_xfer_now = ram_bytes_transferred();
690 if (s->dirty_pages_rate &&
691 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
692 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
693 (rs->dirty_rate_high_cnt++ >= 2)) {
694 trace_migration_throttle();
695 rs->dirty_rate_high_cnt = 0;
696 mig_throttle_guest_down();
698 rs->bytes_xfer_prev = bytes_xfer_now;
701 if (migrate_use_xbzrle()) {
702 if (rs->iterations_prev != rs->iterations) {
703 rs->xbzrle_cache_miss_rate =
704 (double)(rs->xbzrle_cache_miss -
705 rs->xbzrle_cache_miss_prev) /
706 (rs->iterations - rs->iterations_prev);
708 rs->iterations_prev = rs->iterations;
709 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
711 s->dirty_pages_rate = rs->num_dirty_pages_period * 1000
712 / (end_time - rs->time_last_bitmap_sync);
713 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
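        /* For instance, 25000 pages dirtied over a 1000 ms period with
         * 4 KiB target pages (an illustrative size; TARGET_PAGE_SIZE is
         * target dependent) gives dirty_pages_rate = 25000 pages/s and a
         * dirty_bytes_rate of roughly 100 MB/s. */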
714 rs->time_last_bitmap_sync = end_time;
715 rs->num_dirty_pages_period = 0;
717 s->dirty_sync_count = rs->bitmap_sync_count;
718 if (migrate_use_events()) {
719 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
724 * save_zero_page: send the zero page to the stream
726 * Returns the number of pages written.
728 * @rs: current RAM state
729 * @f: QEMUFile where to send the data
730 * @block: block that contains the page we want to send
731 * @offset: offset inside the block for the page
732 * @p: pointer to the page
733 * @bytes_transferred: increase it with the number of transferred bytes
735 static int save_zero_page(RAMState *rs, QEMUFile *f, RAMBlock *block,
737 uint8_t *p, uint64_t *bytes_transferred)
741 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
743 *bytes_transferred += save_page_header(f, block,
744 offset | RAM_SAVE_FLAG_COMPRESS);
746 *bytes_transferred += 1;
753 static void ram_release_pages(MigrationState *ms, const char *rbname,
754 uint64_t offset, int pages)
756 if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
760 ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
764 * ram_save_page: send the given page to the stream
766 * Returns the number of pages written.
768 * >=0 - Number of pages written - this might legally be 0
769 * if xbzrle noticed the page was the same.
771 * @rs: current RAM state
772 * @ms: current migration state
773 * @f: QEMUFile where to send the data
774 * @block: block that contains the page we want to send
775 * @offset: offset inside the block for the page
776 * @last_stage: if we are at the completion stage
777 * @bytes_transferred: increase it with the number of transferred bytes
779 static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
780 PageSearchStatus *pss, bool last_stage,
781 uint64_t *bytes_transferred)
785 ram_addr_t current_addr;
788 bool send_async = true;
789 RAMBlock *block = pss->block;
790 ram_addr_t offset = pss->offset;
792 p = block->host + offset;
794 /* When in doubt, send the page as normal */
796 ret = ram_control_save_page(f, block->offset,
797 offset, TARGET_PAGE_SIZE, &bytes_xmit);
799 *bytes_transferred += bytes_xmit;
805 current_addr = block->offset + offset;
807 if (block == rs->last_sent_block) {
808 offset |= RAM_SAVE_FLAG_CONTINUE;
810 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
811 if (ret != RAM_SAVE_CONTROL_DELAYED) {
812 if (bytes_xmit > 0) {
814 } else if (bytes_xmit == 0) {
819 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
821 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
822 * page would be stale
824 xbzrle_cache_zero_page(rs, current_addr);
825 ram_release_pages(ms, block->idstr, pss->offset, pages);
826 } else if (!rs->ram_bulk_stage &&
827 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
828 pages = save_xbzrle_page(rs, f, &p, current_addr, block,
829 offset, last_stage, bytes_transferred);
831 /* Can't send this cached data async, since the cache page
832 * might get updated before it gets to the wire
839 /* XBZRLE overflow or normal page */
841 *bytes_transferred += save_page_header(f, block,
842 offset | RAM_SAVE_FLAG_PAGE);
844 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
845 migrate_release_ram() &
846 migration_in_postcopy(ms));
848 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
850 *bytes_transferred += TARGET_PAGE_SIZE;
855 XBZRLE_cache_unlock();
860 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
863 int bytes_sent, blen;
864 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
866 bytes_sent = save_page_header(f, block, offset |
867 RAM_SAVE_FLAG_COMPRESS_PAGE);
868 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
869 migrate_compress_level());
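    /* A negative blen indicates that compressing or buffering the page
     * failed; the error is propagated to the migration stream below. */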
872 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
873 error_report("compressed data failed!");
876 ram_release_pages(migrate_get_current(), block->idstr,
877 offset & TARGET_PAGE_MASK, 1);
883 static uint64_t bytes_transferred;
885 static void flush_compressed_data(QEMUFile *f)
887 int idx, len, thread_count;
889 if (!migrate_use_compression()) {
892 thread_count = migrate_compress_threads();
894 qemu_mutex_lock(&comp_done_lock);
895 for (idx = 0; idx < thread_count; idx++) {
896 while (!comp_param[idx].done) {
897 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
900 qemu_mutex_unlock(&comp_done_lock);
902 for (idx = 0; idx < thread_count; idx++) {
903 qemu_mutex_lock(&comp_param[idx].mutex);
904 if (!comp_param[idx].quit) {
905 len = qemu_put_qemu_file(f, comp_param[idx].file);
906 bytes_transferred += len;
908 qemu_mutex_unlock(&comp_param[idx].mutex);
912 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
915 param->block = block;
916 param->offset = offset;
919 static int compress_page_with_multi_thread(RAMState *rs, QEMUFile *f,
920 RAMBlock *block, ram_addr_t offset,
921 uint64_t *bytes_transferred)
923 int idx, thread_count, bytes_xmit = -1, pages = -1;
925 thread_count = migrate_compress_threads();
926 qemu_mutex_lock(&comp_done_lock);
928 for (idx = 0; idx < thread_count; idx++) {
929 if (comp_param[idx].done) {
930 comp_param[idx].done = false;
931 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
932 qemu_mutex_lock(&comp_param[idx].mutex);
933 set_compress_params(&comp_param[idx], block, offset);
934 qemu_cond_signal(&comp_param[idx].cond);
935 qemu_mutex_unlock(&comp_param[idx].mutex);
938 *bytes_transferred += bytes_xmit;
945 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
948 qemu_mutex_unlock(&comp_done_lock);
954 * ram_save_compressed_page: compress the given page and send it to the stream
956 * Returns the number of pages written.
958 * @rs: current RAM state
959 * @ms: current migration state
960 * @f: QEMUFile where to send the data
961 * @block: block that contains the page we want to send
962 * @offset: offset inside the block for the page
963 * @last_stage: if we are at the completion stage
964 * @bytes_transferred: increase it with the number of transferred bytes
966 static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
968 PageSearchStatus *pss, bool last_stage,
969 uint64_t *bytes_transferred)
972 uint64_t bytes_xmit = 0;
975 RAMBlock *block = pss->block;
976 ram_addr_t offset = pss->offset;
978 p = block->host + offset;
980 ret = ram_control_save_page(f, block->offset,
981 offset, TARGET_PAGE_SIZE, &bytes_xmit);
983 *bytes_transferred += bytes_xmit;
986 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
987 if (ret != RAM_SAVE_CONTROL_DELAYED) {
988 if (bytes_xmit > 0) {
990 } else if (bytes_xmit == 0) {
995 /* When starting the process of a new block, the first page of
996 * the block should be sent out before other pages in the same
997 * block, and all the pages in the last block should have been sent
998 * out. Keeping this order is important because the 'cont' flag
999 * is used to avoid resending the block name.
1001 if (block != rs->last_sent_block) {
1002 flush_compressed_data(f);
1003 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
1005 /* Make sure the first page is sent out before other pages */
1006 bytes_xmit = save_page_header(f, block, offset |
1007 RAM_SAVE_FLAG_COMPRESS_PAGE);
1008 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1009 migrate_compress_level());
1011 *bytes_transferred += bytes_xmit + blen;
1015 qemu_file_set_error(f, blen);
1016 error_report("compressed data failed!");
1020 ram_release_pages(ms, block->idstr, pss->offset, pages);
1023 offset |= RAM_SAVE_FLAG_CONTINUE;
1024 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
1026 pages = compress_page_with_multi_thread(rs, f, block, offset,
1029 ram_release_pages(ms, block->idstr, pss->offset, pages);
1038 * find_dirty_block: find the next dirty page and update any state
1039 * associated with the search process.
1041 * Returns true if a page is found
1043 * @rs: current RAM state
1044 * @f: QEMUFile where to send the data
1045 * @pss: data about the state of the current dirty page scan
1046 * @again: set to false if the search has scanned the whole of RAM
1047 * @ram_addr_abs: pointer into which to store the address of the dirty page
1048 * within the global ram_addr space
1050 static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
1051 bool *again, ram_addr_t *ram_addr_abs)
1053 pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
1055 if (pss->complete_round && pss->block == rs->last_seen_block &&
1056 pss->offset >= rs->last_offset) {
1058 * We've been once around the RAM and haven't found anything.
1064 if (pss->offset >= pss->block->used_length) {
1065 /* Didn't find anything in this RAM Block */
1067 pss->block = QLIST_NEXT_RCU(pss->block, next);
1069 /* Hit the end of the list */
1070 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1071 /* Flag that we've looped */
1072 pss->complete_round = true;
1073 rs->ram_bulk_stage = false;
1074 if (migrate_use_xbzrle()) {
1075 /* If xbzrle is on, stop using the data compression at this
1076 * point. In theory, xbzrle can do better than compression.
1078 flush_compressed_data(f);
1079 compression_switch = false;
1082 /* Didn't find anything this time, but try again on the new block */
1086 /* Can go around again, but... */
1088 /* We've found something so probably don't need to */
1094 * unqueue_page: gets a page off the queue
1096 * Helper for 'get_queued_page' - gets a page off the queue
1098 * Returns the block of the page (or NULL if none available)
1100 * @ms: current migration state
1101 * @offset: used to return the offset within the RAMBlock
1102 * @ram_addr_abs: pointer into which to store the address of the dirty page
1103 * within the global ram_addr space
1105 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1106 ram_addr_t *ram_addr_abs)
1108 RAMBlock *block = NULL;
1110 qemu_mutex_lock(&ms->src_page_req_mutex);
1111 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1112 struct MigrationSrcPageRequest *entry =
1113 QSIMPLEQ_FIRST(&ms->src_page_requests);
1115 *offset = entry->offset;
1116 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1119 if (entry->len > TARGET_PAGE_SIZE) {
1120 entry->len -= TARGET_PAGE_SIZE;
1121 entry->offset += TARGET_PAGE_SIZE;
1123 memory_region_unref(block->mr);
1124 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1128 qemu_mutex_unlock(&ms->src_page_req_mutex);
1134 * get_queued_page: unqueue a page from the postcopy requests
1136 * Skips pages that are already sent (!dirty)
1138 * Returns true if a queued page is found
1140 * @rs: current RAM state
1141 * @ms: current migration state
1142 * @pss: data about the state of the current dirty page scan
1143 * @ram_addr_abs: pointer into which to store the address of the dirty page
1144 * within the global ram_addr space
1146 static bool get_queued_page(RAMState *rs, MigrationState *ms,
1147 PageSearchStatus *pss,
1148 ram_addr_t *ram_addr_abs)
1155 block = unqueue_page(ms, &offset, ram_addr_abs);
1157 * We're sending this page, and since it's postcopy nothing else
1158 * will dirty it, and we must make sure it doesn't get sent again
1159 * even if this queue request was received after the background
1160 * search already sent it.
1163 unsigned long *bitmap;
1164 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1165 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1167 trace_get_queued_page_not_dirty(
1168 block->idstr, (uint64_t)offset,
1169 (uint64_t)*ram_addr_abs,
1170 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1171 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1173 trace_get_queued_page(block->idstr,
1175 (uint64_t)*ram_addr_abs);
1179 } while (block && !dirty);
1183 * As soon as we start servicing pages out of order, we have
1184 * to kill the bulk stage, since the bulk stage assumes
1185 * (in migration_bitmap_find_and_reset_dirty) that every page is
1186 * dirty, which is no longer true.
1188 rs->ram_bulk_stage = false;
1191 * We want the background search to continue from the queued page
1192 * since the guest is likely to want other pages near to the page
1193 * it just requested.
1196 pss->offset = offset;
1203 * migration_page_queue_free: drop any remaining pages in the ram request queue
1206 * It should be empty at the end anyway, but in error cases there may
1207 * be some left. In case any page is left, we drop it.
1209 * @ms: current migration state
1211 void migration_page_queue_free(MigrationState *ms)
1213 struct MigrationSrcPageRequest *mspr, *next_mspr;
1214 /* This queue generally should be empty - but in the case of a failed
1215 * migration it might have some droppings in it.
1218 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1219 memory_region_unref(mspr->rb->mr);
1220 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1227 * ram_save_queue_pages: queue the page for transmission
1229 * A request from postcopy destination for example.
1231 * Returns zero on success or negative on error
1233 * @ms: current migration state
1234 * @rbname: Name of the RAMBlock of the request. NULL means the
1235 * same as the last one.
1236 * @start: starting address from the start of the RAMBlock
1237 * @len: length (in bytes) to send
1239 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1240 ram_addr_t start, ram_addr_t len)
1244 ms->postcopy_requests++;
1247 /* Reuse last RAMBlock */
1248 ramblock = ms->last_req_rb;
1252 * Shouldn't happen, we can't reuse the last RAMBlock if
1253 * it's the 1st request.
1255 error_report("ram_save_queue_pages no previous block");
1259 ramblock = qemu_ram_block_by_name(rbname);
1262 /* We shouldn't be asked for a non-existent RAMBlock */
1263 error_report("ram_save_queue_pages no block '%s'", rbname);
1266 ms->last_req_rb = ramblock;
1268 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1269 if (start+len > ramblock->used_length) {
1270 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1271 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1272 __func__, start, len, ramblock->used_length);
1276 struct MigrationSrcPageRequest *new_entry =
1277 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1278 new_entry->rb = ramblock;
1279 new_entry->offset = start;
1280 new_entry->len = len;
1282 memory_region_ref(ramblock->mr);
1283 qemu_mutex_lock(&ms->src_page_req_mutex);
1284 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1285 qemu_mutex_unlock(&ms->src_page_req_mutex);
1296 * ram_save_target_page: save one target page
1298 * Returns the number of pages written
1300 * @rs: current RAM state
1301 * @ms: current migration state
1302 * @f: QEMUFile where to send the data
1303 * @pss: data about the page we want to send
1304 * @last_stage: if we are at the completion stage
1305 * @bytes_transferred: increase it with the number of transferred bytes
1306 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
1308 static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
1309 PageSearchStatus *pss,
1311 uint64_t *bytes_transferred,
1312 ram_addr_t dirty_ram_abs)
1316 /* Check if the page is dirty and, if so, send it */
1317 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1318 unsigned long *unsentmap;
1319 if (compression_switch && migrate_use_compression()) {
1320 res = ram_save_compressed_page(rs, ms, f, pss,
1324 res = ram_save_page(rs, ms, f, pss, last_stage,
1331 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1333 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1335 /* Only update last_sent_block if a block was actually sent; xbzrle
1336 * might have decided the page was identical so didn't bother writing to the stream. */
1340 rs->last_sent_block = pss->block;
1348 * ram_save_host_page: save a whole host page
1350 * Starting at *offset send pages up to the end of the current host
1351 * page. It's valid for the initial offset to point into the middle of
1352 * a host page in which case the remainder of the hostpage is sent.
1353 * Only dirty target pages are sent. Note that the host page size may
1354 * be a huge page for this block.
1356 * Returns the number of pages written or negative on error
1358 * @rs: current RAM state
1359 * @ms: current migration state
1360 * @f: QEMUFile where to send the data
1361 * @pss: data about the page we want to send
1362 * @last_stage: if we are at the completion stage
1363 * @bytes_transferred: increase it with the number of transferred bytes
1364 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1366 static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
1367 PageSearchStatus *pss,
1369 uint64_t *bytes_transferred,
1370 ram_addr_t dirty_ram_abs)
1372 int tmppages, pages = 0;
1373 size_t pagesize = qemu_ram_pagesize(pss->block);
1376 tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
1377 bytes_transferred, dirty_ram_abs);
1383 pss->offset += TARGET_PAGE_SIZE;
1384 dirty_ram_abs += TARGET_PAGE_SIZE;
1385 } while (pss->offset & (pagesize - 1));
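    /* The loop above stops at a host-page boundary: e.g. with 2 MiB huge
     * pages and 4 KiB target pages (illustrative sizes) up to 512 target
     * pages are sent before the mask check becomes zero. */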
1387 /* The offset we leave with is the last one we looked at */
1388 pss->offset -= TARGET_PAGE_SIZE;
1393 * ram_find_and_save_block: finds a dirty page and sends it to f
1395 * Called within an RCU critical section.
1397 * Returns the number of pages written where zero means no dirty pages
1399 * @rs: current RAM state
1400 * @f: QEMUFile where to send the data
1401 * @last_stage: if we are at the completion stage
1402 * @bytes_transferred: increase it with the number of transferred bytes
1404 * On systems where host-page-size > target-page-size it will send all the
1405 * pages in a host page that are dirty.
1408 static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage,
1409 uint64_t *bytes_transferred)
1411 PageSearchStatus pss;
1412 MigrationState *ms = migrate_get_current();
1415 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1418 /* No dirty page as there is zero RAM */
1419 if (!ram_bytes_total()) {
1423 pss.block = rs->last_seen_block;
1424 pss.offset = rs->last_offset;
1425 pss.complete_round = false;
1428 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1433 found = get_queued_page(rs, ms, &pss, &dirty_ram_abs);
1436 /* priority queue empty, so just search for something dirty */
1437 found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
1441 pages = ram_save_host_page(rs, ms, f, &pss,
1442 last_stage, bytes_transferred,
1445 } while (!pages && again);
1447 rs->last_seen_block = pss.block;
1448 rs->last_offset = pss.offset;
1453 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1455 uint64_t pages = size / TARGET_PAGE_SIZE;
1456 RAMState *rs = &ram_state;
1459 rs->zero_pages += pages;
1461 rs->norm_pages += pages;
1462 bytes_transferred += size;
1463 qemu_update_position(f, size);
1467 static ram_addr_t ram_save_remaining(void)
1469 return migration_dirty_pages;
1472 uint64_t ram_bytes_remaining(void)
1474 return ram_save_remaining() * TARGET_PAGE_SIZE;
1477 uint64_t ram_bytes_transferred(void)
1479 return bytes_transferred;
1482 uint64_t ram_bytes_total(void)
1488 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1489 total += block->used_length;
1494 void free_xbzrle_decoded_buf(void)
1496 g_free(xbzrle_decoded_buf);
1497 xbzrle_decoded_buf = NULL;
1500 static void migration_bitmap_free(struct BitmapRcu *bmap)
1503 g_free(bmap->unsentmap);
1507 static void ram_migration_cleanup(void *opaque)
1509 /* the caller must hold the iothread lock or be in a bottom half, so there is
1510 * no write race against this migration_bitmap
1512 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1513 atomic_rcu_set(&migration_bitmap_rcu, NULL);
1515 memory_global_dirty_log_stop();
1516 call_rcu(bitmap, migration_bitmap_free, rcu);
1519 XBZRLE_cache_lock();
1521 cache_fini(XBZRLE.cache);
1522 g_free(XBZRLE.encoded_buf);
1523 g_free(XBZRLE.current_buf);
1524 g_free(ZERO_TARGET_PAGE);
1525 XBZRLE.cache = NULL;
1526 XBZRLE.encoded_buf = NULL;
1527 XBZRLE.current_buf = NULL;
1529 XBZRLE_cache_unlock();
1532 static void ram_state_reset(RAMState *rs)
1534 rs->last_seen_block = NULL;
1535 rs->last_sent_block = NULL;
1536 rs->last_offset = 0;
1537 rs->last_version = ram_list.version;
1538 rs->ram_bulk_stage = true;
1541 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1543 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1545 /* called in the QEMU main thread, so there is
1546 * no write race against this migration_bitmap
1548 if (migration_bitmap_rcu) {
1549 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1550 bitmap = g_new(struct BitmapRcu, 1);
1551 bitmap->bmap = bitmap_new(new);
1553 /* prevent bits in migration_bitmap from being set
1554 * by migration_bitmap_sync_range() at the same time;
1555 * it is safe for migration if a migration_bitmap bit is cleared concurrently. */
1558 qemu_mutex_lock(&migration_bitmap_mutex);
1559 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1560 bitmap_set(bitmap->bmap, old, new - old);
1562 /* We don't have a way to safely extend the sentmap
1563 * with RCU; so mark it as missing, entry to postcopy will fail. */
1566 bitmap->unsentmap = NULL;
1568 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1569 qemu_mutex_unlock(&migration_bitmap_mutex);
1570 migration_dirty_pages += new - old;
1571 call_rcu(old_bitmap, migration_bitmap_free, rcu);
1576 * 'expected' is the value you expect the bitmap mostly to be full
1577 * of; it won't bother printing lines that are all this value.
1578 * If 'todump' is null the migration bitmap is dumped.
1580 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1582 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1585 int64_t linelen = 128;
1589 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1592 for (cur = 0; cur < ram_pages; cur += linelen) {
1596 * Last line; catch the case where the line length
1597 * is longer than remaining ram
1599 if (cur + linelen > ram_pages) {
1600 linelen = ram_pages - cur;
1602 for (curb = 0; curb < linelen; curb++) {
1603 bool thisbit = test_bit(cur + curb, todump);
1604 linebuf[curb] = thisbit ? '1' : '.';
1605 found = found || (thisbit != expected);
1608 linebuf[curb] = '\0';
1609 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1614 /* **** functions for postcopy ***** */
1616 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1618 struct RAMBlock *block;
1619 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1621 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1622 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1623 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1624 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1626 while (run_start < range) {
1627 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1628 ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1629 (run_end - run_start) << TARGET_PAGE_BITS);
1630 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1636 * postcopy_send_discard_bm_ram: discard a RAMBlock
1638 * Returns zero on success
1640 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1641 * Note: At this point the 'unsentmap' is the processed bitmap combined
1642 * with the dirtymap; so a '1' means it's either dirty or unsent.
1644 * @ms: current migration state
1645 * @pds: state for postcopy
1646 * @start: RAMBlock starting page
1647 * @length: RAMBlock size
1649 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1650 PostcopyDiscardState *pds,
1651 unsigned long start,
1652 unsigned long length)
1654 unsigned long end = start + length; /* one after the end */
1655 unsigned long current;
1656 unsigned long *unsentmap;
1658 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1659 for (current = start; current < end; ) {
1660 unsigned long one = find_next_bit(unsentmap, end, current);
1663 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1664 unsigned long discard_length;
1667 discard_length = end - one;
1669 discard_length = zero - one;
1671 if (discard_length) {
1672 postcopy_discard_send_range(ms, pds, one, discard_length);
1674 current = one + discard_length;
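        /* Each pass finds a run of set (unsent/dirty) bits starting at
         * 'one', measures its length up to the next clear bit or the end
         * of the block, and sends the whole run as one discard command. */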
1684 * postcopy_each_ram_send_discard: discard all RAMBlocks
1686 * Returns 0 for success or negative for error
1688 * Utility for the outgoing postcopy code.
1689 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1690 * passing it bitmap indexes and name.
1691 * (qemu_ram_foreach_block ends up passing unscaled lengths
1692 * which would mean postcopy code would have to deal with target page)
1694 * @ms: current migration state
1696 static int postcopy_each_ram_send_discard(MigrationState *ms)
1698 struct RAMBlock *block;
1701 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1702 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1703 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1708 * Postcopy sends chunks of bitmap over the wire, but it
1709 * just needs indexes at this point, which avoids it having
1710 * target page specific code.
1712 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1713 block->used_length >> TARGET_PAGE_BITS);
1714 postcopy_discard_send_finish(ms, pds);
1724 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1726 * Helper for postcopy_chunk_hostpages; it's called twice to
1727 * canonicalize the two bitmaps, which are similar but one is inverted.
1730 * Postcopy requires that all target pages in a hostpage are dirty or
1731 * clean, not a mix. This function canonicalizes the bitmaps.
1733 * @ms: current migration state
1734 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1735 * otherwise we need to canonicalize partially dirty host pages
1736 * @block: block that contains the page we want to canonicalize
1737 * @pds: state for postcopy
1739 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1741 PostcopyDiscardState *pds)
1743 unsigned long *bitmap;
1744 unsigned long *unsentmap;
1745 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
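    /* host_ratio is the number of target pages per host page, e.g. 512
     * for a 2 MiB hugetlbfs block with 4 KiB target pages (illustrative
     * sizes; both depend on the host and target configuration). */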
1746 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1747 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1748 unsigned long last = first + (len - 1);
1749 unsigned long run_start;
1751 if (block->page_size == TARGET_PAGE_SIZE) {
1752 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1756 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1757 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1760 /* Find a sent page */
1761 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1763 /* Find a dirty page */
1764 run_start = find_next_bit(bitmap, last + 1, first);
1767 while (run_start <= last) {
1768 bool do_fixup = false;
1769 unsigned long fixup_start_addr;
1770 unsigned long host_offset;
1773 * If the start of this run of pages is in the middle of a host
1774 * page, then we need to fix up this host page.
1776 host_offset = run_start % host_ratio;
1779 run_start -= host_offset;
1780 fixup_start_addr = run_start;
1781 /* For the next pass */
1782 run_start = run_start + host_ratio;
1784 /* Find the end of this run */
1785 unsigned long run_end;
1787 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1789 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1792 * If the end isn't at the start of a host page, then the
1793 * run doesn't finish at the end of a host page
1794 * and we need to discard.
1796 host_offset = run_end % host_ratio;
1799 fixup_start_addr = run_end - host_offset;
1801 * This host page has gone, the next loop iteration starts
1802 * from after the fixup
1804 run_start = fixup_start_addr + host_ratio;
1807 * No discards on this iteration, next loop starts from
1808 * next sent/dirty page
1810 run_start = run_end + 1;
1817 /* Tell the destination to discard this page */
1818 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1819 /* For the unsent_pass we:
1820 * discard partially sent pages
1821 * For the !unsent_pass (dirty) we:
1822 * discard partially dirty pages that were sent
1823 * (any partially sent pages were already discarded
1824 * by the previous unsent_pass)
1826 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1830 /* Clean up the bitmap */
1831 for (page = fixup_start_addr;
1832 page < fixup_start_addr + host_ratio; page++) {
1833 /* All pages in this host page are now not sent */
1834 set_bit(page, unsentmap);
1837 * Remark them as dirty, updating the count for any pages
1838 * that weren't previously dirty.
1840 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1845 /* Find the next sent page for the next iteration */
1846 run_start = find_next_zero_bit(unsentmap, last + 1,
1849 /* Find the next dirty page for the next iteration */
1850 run_start = find_next_bit(bitmap, last + 1, run_start);
1856 * postcopy_chunk_hostpages: discard any partially sent host page
1858 * Utility for the outgoing postcopy code.
1860 * Discard any partially sent host-page size chunks, mark any partially
1861 * dirty host-page size chunks as all dirty. In this case the host-page
1862 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1864 * Returns zero on success
1866 * @ms: current migration state
1868 static int postcopy_chunk_hostpages(MigrationState *ms)
1870 RAMState *rs = &ram_state;
1871 struct RAMBlock *block;
1873 /* Easiest way to make sure we don't resume in the middle of a host-page */
1874 rs->last_seen_block = NULL;
1875 rs->last_sent_block = NULL;
1876 rs->last_offset = 0;
1878 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1879 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1881 PostcopyDiscardState *pds =
1882 postcopy_discard_send_init(ms, first, block->idstr);
1884 /* First pass: Discard all partially sent host pages */
1885 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1887 * Second pass: Ensure that all partially dirty host pages are made fully dirty. */
1890 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1892 postcopy_discard_send_finish(ms, pds);
1893 } /* ram_list loop */
1899 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1901 * Returns zero on success
1903 * Transmit the set of pages to be discarded after precopy to the target.
1904 * These are pages that:
1905 * a) Have been previously transmitted but are now dirty again
1906 * b) Pages that have never been transmitted, this ensures that
1907 * any pages on the destination that have been mapped by background
1908 * tasks get discarded (transparent huge pages is the specific concern)
1909 * Hopefully this is pretty sparse
1911 * @ms: current migration state
1913 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1916 unsigned long *bitmap, *unsentmap;
1920 /* This should be our last sync, the src is now paused */
1921 migration_bitmap_sync(&ram_state);
1923 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1925 /* We don't have a safe way to resize the sentmap, so
1926 * if the bitmap was resized it will be NULL at this point. */
1929 error_report("migration ram resized during precopy phase");
1934 /* Deal with TPS != HPS and huge pages */
1935 ret = postcopy_chunk_hostpages(ms);
1942 * Update the unsentmap to be unsentmap = unsentmap | dirty
1944 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1945 bitmap_or(unsentmap, unsentmap, bitmap,
1946 last_ram_offset() >> TARGET_PAGE_BITS);
1949 trace_ram_postcopy_send_discard_bitmap();
1950 #ifdef DEBUG_POSTCOPY
1951 ram_debug_dump_bitmap(unsentmap, true);
1954 ret = postcopy_each_ram_send_discard(ms);
1961 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1963 * Returns zero on success
1965 * @mis: current migration incoming state
1966 * @rbname: name of the RAMBlock of the request. NULL means the
1967 * same as the last one.
1968 * @start: RAMBlock starting page
1969 * @length: RAMBlock size
1971 int ram_discard_range(MigrationIncomingState *mis,
1973 uint64_t start, size_t length)
1977 trace_ram_discard_range(rbname, start, length);
1980 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1983 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1987 ret = ram_block_discard_range(rb, start, length);
1995 static int ram_save_init_globals(RAMState *rs)
1997 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1999 rs->dirty_rate_high_cnt = 0;
2000 rs->bitmap_sync_count = 0;
2004 rs->xbzrle_bytes = 0;
2005 rs->xbzrle_pages = 0;
2006 rs->xbzrle_cache_miss = 0;
2007 rs->xbzrle_cache_miss_rate = 0;
2008 migration_bitmap_sync_init(rs);
2009 qemu_mutex_init(&migration_bitmap_mutex);
2011 if (migrate_use_xbzrle()) {
2012 XBZRLE_cache_lock();
2013 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
2014 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2017 if (!XBZRLE.cache) {
2018 XBZRLE_cache_unlock();
2019 error_report("Error creating cache");
2022 XBZRLE_cache_unlock();
2024 /* We prefer not to abort if there is no memory */
2025 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2026 if (!XBZRLE.encoded_buf) {
2027 error_report("Error allocating encoded_buf");
2031 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2032 if (!XBZRLE.current_buf) {
2033 error_report("Error allocating current_buf");
2034 g_free(XBZRLE.encoded_buf);
2035 XBZRLE.encoded_buf = NULL;
2042 /* For memory_global_dirty_log_start below. */
2043 qemu_mutex_lock_iothread();
2045 qemu_mutex_lock_ramlist();
2047 bytes_transferred = 0;
2048 ram_state_reset(rs);
2050 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
2051 /* Skip setting bitmap if there is no RAM */
2052 if (ram_bytes_total()) {
2053 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2054 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
2055 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
2057 if (migrate_postcopy_ram()) {
2058 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
2059 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
2064 * Count the total number of pages used by ram blocks not including any
2065 * gaps due to alignment or unplugs.
2067 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2069 memory_global_dirty_log_start();
2070 migration_bitmap_sync(rs);
2071 qemu_mutex_unlock_ramlist();
2072 qemu_mutex_unlock_iothread();
2079 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2080 * a long-running RCU critical section. When rcu-reclaims in the code
2081 * start to become numerous it will be necessary to reduce the
2082 * granularity of these critical sections.
2086 * ram_save_setup: Setup RAM for migration
2088 * Returns zero to indicate success and negative for error
2090 * @f: QEMUFile where to send the data
2091 * @opaque: RAMState pointer
2093 static int ram_save_setup(QEMUFile *f, void *opaque)
2095 RAMState *rs = opaque;
2098 /* migration has already set up the bitmap; reuse it. */
2099 if (!migration_in_colo_state()) {
2100 if (ram_save_init_globals(rs) < 0) {
2107 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2109 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2110 qemu_put_byte(f, strlen(block->idstr));
2111 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2112 qemu_put_be64(f, block->used_length);
2113 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2114 qemu_put_be64(f, block->page_size);
2120 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2121 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2123 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2129 * ram_save_iterate: iterative stage for migration
2131 * Returns zero to indicate success and negative for error
2133 * @f: QEMUFile where to send the data
2134 * @opaque: RAMState pointer
2136 static int ram_save_iterate(QEMUFile *f, void *opaque)
2138 RAMState *rs = opaque;
2145 if (ram_list.version != rs->last_version) {
2146 ram_state_reset(rs);
2149 /* Read version before ram_list.blocks */
2152 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2154 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2156 while ((ret = qemu_file_rate_limit(f)) == 0) {
2159 pages = ram_find_and_save_block(rs, f, false, &bytes_transferred);
2160 /* no more pages to send */
2167 /* we want to check in the 1st loop, just in case it was the 1st time
2168 and we had to sync the dirty bitmap.
2169 qemu_get_clock_ns() is a bit expensive, so we only check every few iterations. */
2172 if ((i & 63) == 0) {
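            /* only every 64th iteration reaches this point, so the cost
             * of reading the clock below is amortised over many pages */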
2173 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2174 if (t1 > MAX_WAIT) {
2175 trace_ram_save_iterate_big_wait(t1, i);
2181 flush_compressed_data(f);
2185 * Must occur before EOS (or any QEMUFile operation)
2186 * because of RDMA protocol.
2188 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2190 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2191 bytes_transferred += 8;
2193 ret = qemu_file_get_error(f);
2202 * ram_save_complete: function called to send the remaining amount of ram
2204 * Returns zero to indicate success
2206 * Called with iothread lock
2208 * @f: QEMUFile where to send the data
2209 * @opaque: RAMState pointer
2211 static int ram_save_complete(QEMUFile *f, void *opaque)
2213 RAMState *rs = opaque;
2217 if (!migration_in_postcopy(migrate_get_current())) {
2218 migration_bitmap_sync(rs);
2221 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2223 /* try transferring iterative blocks of memory */
2225 /* flush all remaining blocks regardless of rate limiting */
2229 pages = ram_find_and_save_block(rs, f, !migration_in_colo_state(),
2230 &bytes_transferred);
2231 /* no more blocks to send */
2237 flush_compressed_data(f);
2238 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2242 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2247 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2248 uint64_t *non_postcopiable_pending,
2249 uint64_t *postcopiable_pending)
2251 RAMState *rs = opaque;
2252 uint64_t remaining_size;
2254 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2256 if (!migration_in_postcopy(migrate_get_current()) &&
2257 remaining_size < max_size) {
2258 qemu_mutex_lock_iothread();
2260 migration_bitmap_sync(rs);
2262 qemu_mutex_unlock_iothread();
2263 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2266 /* We can do postcopy, and all the data is postcopiable */
2267 *postcopiable_pending += remaining_size;
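/*
 * Illustrative usage (the names pend_pre/pend_post are hypothetical):
 * the caller passes two accumulators and this function only bumps the
 * postcopiable one, because every remaining dirty RAM page can still be
 * sent after the switch to postcopy.
 *
 *   uint64_t pend_pre = 0, pend_post = 0;
 *   ram_save_pending(f, rs, max_size, &pend_pre, &pend_post);
 *   uint64_t pending = pend_pre + pend_post;   // total outstanding work
 */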
2270 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2272 unsigned int xh_len;
2274 uint8_t *loaded_data;
2276 if (!xbzrle_decoded_buf) {
2277 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2279 loaded_data = xbzrle_decoded_buf;
2281 /* extract RLE header */
2282 xh_flags = qemu_get_byte(f);
2283 xh_len = qemu_get_be16(f);
2285 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2286 error_report("Failed to load XBZRLE page - wrong compression!");
2290 if (xh_len > TARGET_PAGE_SIZE) {
2291 error_report("Failed to load XBZRLE page - len overflow!");
2294 /* load data and decode */
2295 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2298 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2299 TARGET_PAGE_SIZE) == -1) {
2300 error_report("Failed to load XBZRLE page - decode error!");
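/*
 * Wire format consumed by load_xbzrle() above (sketch):
 *
 *   uint8_t  xh_flags;       // must be ENCODING_FLAG_XBZRLE
 *   uint16_t xh_len;         // big endian, at most TARGET_PAGE_SIZE
 *   uint8_t  data[xh_len];   // delta fed to xbzrle_decode_buffer()
 *
 * The decoder patches the existing contents at 'host' in place, so the
 * destination must already hold the previously received version of the
 * page for the delta to apply cleanly.
 */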
2308 * ram_block_from_stream: read a RAMBlock id from the migration stream
2310 * Must be called from within a rcu critical section.
2312 * Returns a pointer from within the RCU-protected ram_list.
2314 * @f: QEMUFile where to read the data from
2315 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2317 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2319 static RAMBlock *block = NULL;
2323 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2325 error_report("Ack, bad migration stream!");
2331 len = qemu_get_byte(f);
2332 qemu_get_buffer(f, (uint8_t *)id, len);
2335 block = qemu_ram_block_by_name(id);
2337 error_report("Can't find block %s", id);
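/*
 * Sketch of the block-id encoding handled above: only the first page
 * sent from a RAMBlock carries a length-prefixed id string; subsequent
 * pages from the same block set RAM_SAVE_FLAG_CONTINUE so just the
 * offset travels on the wire:
 *
 *   [ byte len ][ idstr bytes ]   // present only when CONTINUE is clear
 *
 * The static 'block' pointer caches the most recently named block
 * between calls to avoid repeating the lookup.
 */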
2344 static inline void *host_from_ram_block_offset(RAMBlock *block,
2347 if (!offset_in_ramblock(block, offset)) {
2351 return block->host + offset;
2355 * ram_handle_compressed: handle the zero page case
2357 * If a page (or a whole RDMA chunk) has been
2358 * determined to be zero, then zap it.
2360 * @host: host address for the zero page
2361 * @ch: the byte the page is filled with; only zero is supported
2362 * @size: size of the zero page
2364 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2366 if (ch != 0 || !is_zero_range(host, size)) {
2367 memset(host, ch, size);
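/*
 * Effect of the check above (illustrative): for the common zero-page
 * case the memset is skipped when the destination is already zero, so
 * an untouched page stays untouched:
 *
 *   ram_handle_compressed(host, 0, TARGET_PAGE_SIZE);    // no write if
 *                                                        // already zero
 *   ram_handle_compressed(host, 0xff, TARGET_PAGE_SIZE); // always writes
 */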
2371 static void *do_data_decompress(void *opaque)
2373 DecompressParam *param = opaque;
2374 unsigned long pagesize;
2378 qemu_mutex_lock(&param->mutex);
2379 while (!param->quit) {
2384 qemu_mutex_unlock(&param->mutex);
2386 pagesize = TARGET_PAGE_SIZE;
2387 /* uncompress() can fail in some cases, especially when the page
2388 * was dirtied while it was being compressed.  This is not a
2389 * problem, because the dirty page will be retransferred and
2390 * uncompress() won't corrupt the data in other pages.
2392 uncompress((Bytef *)des, &pagesize,
2393 (const Bytef *)param->compbuf, len);
2395 qemu_mutex_lock(&decomp_done_lock);
2397 qemu_cond_signal(&decomp_done_cond);
2398 qemu_mutex_unlock(&decomp_done_lock);
2400 qemu_mutex_lock(&param->mutex);
2402 qemu_cond_wait(&param->cond, &param->mutex);
2405 qemu_mutex_unlock(&param->mutex);
2410 static void wait_for_decompress_done(void)
2412 int idx, thread_count;
2414 if (!migrate_use_compression()) {
2418 thread_count = migrate_decompress_threads();
2419 qemu_mutex_lock(&decomp_done_lock);
2420 for (idx = 0; idx < thread_count; idx++) {
2421 while (!decomp_param[idx].done) {
2422 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2425 qemu_mutex_unlock(&decomp_done_lock);
2428 void migrate_decompress_threads_create(void)
2430 int i, thread_count;
2432 thread_count = migrate_decompress_threads();
2433 decompress_threads = g_new0(QemuThread, thread_count);
2434 decomp_param = g_new0(DecompressParam, thread_count);
2435 qemu_mutex_init(&decomp_done_lock);
2436 qemu_cond_init(&decomp_done_cond);
2437 for (i = 0; i < thread_count; i++) {
2438 qemu_mutex_init(&decomp_param[i].mutex);
2439 qemu_cond_init(&decomp_param[i].cond);
2440 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2441 decomp_param[i].done = true;
2442 decomp_param[i].quit = false;
2443 qemu_thread_create(decompress_threads + i, "decompress",
2444 do_data_decompress, decomp_param + i,
2445 QEMU_THREAD_JOINABLE);
2449 void migrate_decompress_threads_join(void)
2451 int i, thread_count;
2453 thread_count = migrate_decompress_threads();
2454 for (i = 0; i < thread_count; i++) {
2455 qemu_mutex_lock(&decomp_param[i].mutex);
2456 decomp_param[i].quit = true;
2457 qemu_cond_signal(&decomp_param[i].cond);
2458 qemu_mutex_unlock(&decomp_param[i].mutex);
2460 for (i = 0; i < thread_count; i++) {
2461 qemu_thread_join(decompress_threads + i);
2462 qemu_mutex_destroy(&decomp_param[i].mutex);
2463 qemu_cond_destroy(&decomp_param[i].cond);
2464 g_free(decomp_param[i].compbuf);
2466 g_free(decompress_threads);
2467 g_free(decomp_param);
2468 decompress_threads = NULL;
2469 decomp_param = NULL;
2472 static void decompress_data_with_multi_threads(QEMUFile *f,
2473 void *host, int len)
2475 int idx, thread_count;
2477 thread_count = migrate_decompress_threads();
2478 qemu_mutex_lock(&decomp_done_lock);
2480 for (idx = 0; idx < thread_count; idx++) {
2481 if (decomp_param[idx].done) {
2482 decomp_param[idx].done = false;
2483 qemu_mutex_lock(&decomp_param[idx].mutex);
2484 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2485 decomp_param[idx].des = host;
2486 decomp_param[idx].len = len;
2487 qemu_cond_signal(&decomp_param[idx].cond);
2488 qemu_mutex_unlock(&decomp_param[idx].mutex);
2492 if (idx < thread_count) {
2495 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2498 qemu_mutex_unlock(&decomp_done_lock);
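/*
 * Usage sketch: the worker pool above is driven by ram_load() for
 * RAM_SAVE_FLAG_COMPRESS_PAGE records, roughly:
 *
 *   int len = qemu_get_be32(f);
 *   if (len > 0 && len <= compressBound(TARGET_PAGE_SIZE)) {
 *       decompress_data_with_multi_threads(f, host, len);
 *   }
 *
 * The call returns once the compressed data has been handed to an idle
 * worker (or after waiting on decomp_done_cond for one to free up);
 * wait_for_decompress_done() is what guarantees every handed-off page
 * has actually been written before the section load finishes.
 */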
2502 * ram_postcopy_incoming_init: allocate postcopy data structures
2504 * Returns 0 for success and negative if there was an error
2506 * @mis: current migration incoming state
2508 * Allocate data structures etc needed by incoming migration with
2509 * postcopy-ram. postcopy-ram's similarly named
2510 * postcopy_ram_incoming_init does the work.
2512 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2514 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2516 return postcopy_ram_incoming_init(mis, ram_pages);
2520 * ram_load_postcopy: load a page in postcopy case
2522 * Returns 0 for success or -errno in case of error
2524 * Called in postcopy mode by ram_load().
2525 * rcu_read_lock is taken prior to this being called.
2527 * @f: QEMUFile to read the data from
2529 static int ram_load_postcopy(QEMUFile *f)
2531 int flags = 0, ret = 0;
2532 bool place_needed = false;
2533 bool matching_page_sizes = false;
2534 MigrationIncomingState *mis = migration_incoming_get_current();
2535 /* Temporary page that is later 'placed' */
2536 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2537 void *last_host = NULL;
2538 bool all_zero = false;
2540 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2543 void *page_buffer = NULL;
2544 void *place_source = NULL;
2545 RAMBlock *block = NULL;
2548 addr = qemu_get_be64(f);
2549 flags = addr & ~TARGET_PAGE_MASK;
2550 addr &= TARGET_PAGE_MASK;
2552 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2553 place_needed = false;
2554 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2555 block = ram_block_from_stream(f, flags);
2557 host = host_from_ram_block_offset(block, addr);
2559 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2563 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2565 * Postcopy requires that we place whole host pages atomically;
2566 * these may be huge pages for RAMBlocks that are backed by
2568 * To make it atomic, the data is read into a temporary page
2569 * that's moved into place later.
2570 * The migration protocol uses (possibly smaller) target pages;
2571 * however, the source ensures it always sends all the components
2572 * of a host page in order (a worked example follows this function).
2574 page_buffer = postcopy_host_page +
2575 ((uintptr_t)host & (block->page_size - 1));
2576 /* If all target pages are zero then we can optimise the placement */
2577 if (!((uintptr_t)host & (block->page_size - 1))) {
2580 /* not the first target page within the host page */
2581 if (host != (last_host + TARGET_PAGE_SIZE)) {
2582 error_report("Non-sequential target page %p/%p",
2591 * If it's the last part of a host page then we place the whole host page.
2594 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2595 (block->page_size - 1)) == 0;
2596 place_source = postcopy_host_page;
2600 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2601 case RAM_SAVE_FLAG_COMPRESS:
2602 ch = qemu_get_byte(f);
2603 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2609 case RAM_SAVE_FLAG_PAGE:
2611 if (!place_needed || !matching_page_sizes) {
2612 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2614 /* Avoid the extra copy through the QEMUFile buffer, since the
2615 * data will be copied again when the page is placed; this only
2616 * works when we can read the page in one go (matching page sizes)
2618 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2622 case RAM_SAVE_FLAG_EOS:
2626 error_report("Unknown combination of migration flags: %#x"
2627 " (postcopy mode)", flags);
2632 /* This gets called at the last target page in the host page */
2633 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2636 ret = postcopy_place_page_zero(mis, place_dest,
2639 ret = postcopy_place_page(mis, place_dest,
2640 place_source, block->page_size);
2644 ret = qemu_file_get_error(f);
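/*
 * Worked example for the host-page assembly above (illustrative,
 * assuming 4KiB target pages and a 2MiB hugepage-backed RAMBlock):
 * 512 consecutive target pages are copied into postcopy_host_page at
 * offsets 0, 4K, 8K, ...; only when the last one arrives does
 * place_needed become true and the whole 2MiB page get installed in a
 * single atomic operation:
 *
 *   void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
 *   // place_dest == start of the host page containing 'host'
 *   postcopy_place_page(mis, place_dest, postcopy_host_page,
 *                       block->page_size);
 */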
2651 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2653 int flags = 0, ret = 0;
2654 static uint64_t seq_iter;
2657 * If the system is running in postcopy mode, page inserts to host memory must be atomic.
2660 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2661 /* ADVISE is earlier than LISTENING; it shows the source has the postcopy capability enabled */
2662 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2666 if (version_id != 4) {
2670 /* This RCU critical section can be very long running.
2671 * If RCU reclamations in the code start to become numerous,
2672 * it will be necessary to reduce the granularity of this
2677 if (postcopy_running) {
2678 ret = ram_load_postcopy(f);
2681 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2682 ram_addr_t addr, total_ram_bytes;
2686 addr = qemu_get_be64(f);
2687 flags = addr & ~TARGET_PAGE_MASK;
2688 addr &= TARGET_PAGE_MASK;
2690 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2691 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2692 RAMBlock *block = ram_block_from_stream(f, flags);
2694 host = host_from_ram_block_offset(block, addr);
2696 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2702 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2703 case RAM_SAVE_FLAG_MEM_SIZE:
2704 /* Synchronize RAM block list */
2705 total_ram_bytes = addr;
2706 while (!ret && total_ram_bytes) {
2711 len = qemu_get_byte(f);
2712 qemu_get_buffer(f, (uint8_t *)id, len);
2714 length = qemu_get_be64(f);
2716 block = qemu_ram_block_by_name(id);
2718 if (length != block->used_length) {
2719 Error *local_err = NULL;
2721 ret = qemu_ram_resize(block, length,
2724 error_report_err(local_err);
2727 /* For postcopy we need to check that hugepage sizes match */
2728 if (postcopy_advised &&
2729 block->page_size != qemu_host_page_size) {
2730 uint64_t remote_page_size = qemu_get_be64(f);
2731 if (remote_page_size != block->page_size) {
2732 error_report("Mismatched RAM page size %s "
2733 "(local) %zd != %" PRId64,
2734 id, block->page_size,
2739 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2742 error_report("Unknown ramblock \"%s\", cannot "
2743 "accept migration", id);
2747 total_ram_bytes -= length;
2751 case RAM_SAVE_FLAG_COMPRESS:
2752 ch = qemu_get_byte(f);
2753 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2756 case RAM_SAVE_FLAG_PAGE:
2757 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2760 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2761 len = qemu_get_be32(f);
2762 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2763 error_report("Invalid compressed data length: %d", len);
2767 decompress_data_with_multi_threads(f, host, len);
2770 case RAM_SAVE_FLAG_XBZRLE:
2771 if (load_xbzrle(f, addr, host) < 0) {
2772 error_report("Failed to decompress XBZRLE page at "
2773 RAM_ADDR_FMT, addr);
2778 case RAM_SAVE_FLAG_EOS:
2782 if (flags & RAM_SAVE_FLAG_HOOK) {
2783 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2785 error_report("Unknown combination of migration flags: %#x",
2791 ret = qemu_file_get_error(f);
2795 wait_for_decompress_done();
2797 trace_ram_load_complete(ret, seq_iter);
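/*
 * Summary of the record format parsed by ram_load() above (sketch):
 *
 *   be64 header = page offset | flags   // flags sit in the low bits
 *   RAM_SAVE_FLAG_MEM_SIZE      -> block list (see ram_save_setup)
 *   RAM_SAVE_FLAG_COMPRESS      -> one fill byte (zero-page case)
 *   RAM_SAVE_FLAG_PAGE          -> TARGET_PAGE_SIZE raw bytes
 *   RAM_SAVE_FLAG_COMPRESS_PAGE -> be32 len + len compressed bytes
 *   RAM_SAVE_FLAG_XBZRLE        -> XBZRLE header + delta (see load_xbzrle)
 *   RAM_SAVE_FLAG_EOS           -> end of section
 *   RAM_SAVE_FLAG_CONTINUE      -> modifier: reuse the previous RAMBlock
 *   RAM_SAVE_FLAG_HOOK          -> handled by ram_control_load_hook()
 */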
2801 static SaveVMHandlers savevm_ram_handlers = {
2802 .save_live_setup = ram_save_setup,
2803 .save_live_iterate = ram_save_iterate,
2804 .save_live_complete_postcopy = ram_save_complete,
2805 .save_live_complete_precopy = ram_save_complete,
2806 .save_live_pending = ram_save_pending,
2807 .load_state = ram_load,
2808 .cleanup = ram_migration_cleanup,
2811 void ram_mig_init(void)
2813 qemu_mutex_init(&XBZRLE.lock);
2814 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
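/*
 * Rough call order driven by the handlers registered above (precopy
 * case, sketch): ram_save_setup() runs once and writes the block list,
 * ram_save_pending()/ram_save_iterate() repeat until the remaining
 * dirty RAM fits in the downtime budget, ram_save_complete() does the
 * final pass, and ram_load() consumes every section on the destination.
 * The version number 4 passed to register_savevm_live() must match the
 * version_id check in ram_load().
 */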