migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <[email protected]>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "xbzrle.h"
  39 #include "ram.h"
  40 #include "migration/migration.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "migration/vmstate.h"
  44 #include "postcopy-ram.h"
  45 #include "exec/address-spaces.h"
  46 #include "migration/page_cache.h"
  47 #include "qemu/error-report.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "migration/colo.h"
  52
  53 /***********************************************************/
  54 /* ram save/restore */
  55
  56 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  57  * worked for pages that were filled with the same char.  We switched
  58  * it to only search for the zero value.  And to avoid confusion with
  59  * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
  60  */
  61
/*
 * Flag bits that callers in this file OR into the offset word handed to
 * save_page_header(); that word is written as a big-endian 64-bit value.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02 /* used by save_zero_page() for all-zero pages */
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08 /* header is followed by a raw page */
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20 /* same block as the previous header: no idstr follows */
#define RAM_SAVE_FLAG_XBZRLE   0x40 /* header is followed by XBZRLE-encoded data */
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100 /* used by do_compress_ram_page() */
  71
  72 static uint8_t *ZERO_TARGET_PAGE;
  73
  74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  75 {
  76     return buffer_is_zero(p, size);
  77 }
  78
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (private copy of the page being
     * encoded, see save_xbzrle_page) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken/released through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
} XBZRLE;
  90
  91 /* buffer used for XBZRLE decoding */
  92 static uint8_t *xbzrle_decoded_buf;
  93
  94 static void XBZRLE_cache_lock(void)
  95 {
  96     if (migrate_use_xbzrle())
  97         qemu_mutex_lock(&XBZRLE.lock);
  98 }
  99
 100 static void XBZRLE_cache_unlock(void)
 101 {
 102     if (migrate_use_xbzrle())
 103         qemu_mutex_unlock(&XBZRLE.lock);
 104 }
 105
 106 /**
 107  * xbzrle_cache_resize: resize the xbzrle cache
 108  *
 109  * This function is called from qmp_migrate_set_cache_size in main
 110  * thread, possibly while a migration is in progress.  A running
 111  * migration may be using the cache and might finish during this call,
 112  * hence changes to the cache are protected by XBZRLE.lock().
 113  *
 114  * Returns the new_size or negative in case of error.
 115  *
 116  * @new_size: new cache size
 117  */
 118 int64_t xbzrle_cache_resize(int64_t new_size)
 119 {
 120     PageCache *new_cache;
 121     int64_t ret;
 122
 123     if (new_size < TARGET_PAGE_SIZE) {
 124         return -1;
 125     }
 126
 127     XBZRLE_cache_lock();
 128
 129     if (XBZRLE.cache != NULL) {
 130         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 131             goto out_new_size;
 132         }
 133         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 134                                         TARGET_PAGE_SIZE);
 135         if (!new_cache) {
 136             error_report("Error creating cache");
 137             ret = -1;
 138             goto out;
 139         }
 140
 141         cache_fini(XBZRLE.cache);
 142         XBZRLE.cache = new_cache;
 143     }
 144
 145 out_new_size:
 146     ret = pow2floor(new_size);
 147 out:
 148     XBZRLE_cache_unlock();
 149     return ret;
 150 }
 151
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* NOTE(review): presumably the block the requested range lives in and
     * the start/length of that range within it - confirm against the
     * code that queues requests (outside this view). */
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    /* linkage for the RAMState.src_page_requests queue */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 163
 164 /* State of RAM for migration */
 165 struct RAMState {
 166     /* QEMUFile used for this migration */
 167     QEMUFile *f;
 168     /* Last block that we have visited searching for dirty pages */
 169     RAMBlock *last_seen_block;
 170     /* Last block from where we have sent data */
 171     RAMBlock *last_sent_block;
 172     /* Last dirty target page we have sent */
 173     ram_addr_t last_page;
 174     /* last ram version we have seen */
 175     uint32_t last_version;
 176     /* We are in the first round */
 177     bool ram_bulk_stage;
 178     /* How many times we have dirty too many pages */
 179     int dirty_rate_high_cnt;
 180     /* How many times we have synchronized the bitmap */
 181     uint64_t bitmap_sync_count;
 182     /* these variables are used for bitmap sync */
 183     /* last time we did a full bitmap_sync */
 184     int64_t time_last_bitmap_sync;
 185     /* bytes transferred at start_time */
 186     uint64_t bytes_xfer_prev;
 187     /* number of dirty pages since start_time */
 188     uint64_t num_dirty_pages_period;
 189     /* xbzrle misses since the beginning of the period */
 190     uint64_t xbzrle_cache_miss_prev;
 191     /* number of iterations at the beginning of period */
 192     uint64_t iterations_prev;
 193     /* Accounting fields */
 194     /* number of zero pages.  It used to be pages filled by the same char. */
 195     uint64_t zero_pages;
 196     /* number of normal transferred pages */
 197     uint64_t norm_pages;
 198     /* Iterations since start */
 199     uint64_t iterations;
 200     /* xbzrle transmitted bytes.  Notice that this is with
 201      * compression, they can't be calculated from the pages */
 202     uint64_t xbzrle_bytes;
 203     /* xbzrle transmmited pages */
 204     uint64_t xbzrle_pages;
 205     /* xbzrle number of cache miss */
 206     uint64_t xbzrle_cache_miss;
 207     /* xbzrle miss rate */
 208     double xbzrle_cache_miss_rate;
 209     /* xbzrle number of overflows */
 210     uint64_t xbzrle_overflows;
 211     /* number of dirty bits in the bitmap */
 212     uint64_t migration_dirty_pages;
 213     /* total number of bytes transferred */
 214     uint64_t bytes_transferred;
 215     /* number of dirtied pages in the last second */
 216     uint64_t dirty_pages_rate;
 217     /* Count of requests incoming from destination */
 218     uint64_t postcopy_requests;
 219     /* protects modification of the bitmap */
 220     QemuMutex bitmap_mutex;
 221     /* The RAMBlock used in the last src_page_requests */
 222     RAMBlock *last_req_rb;
 223     /* Queue of outstanding page requests from the destination */
 224     QemuMutex src_page_req_mutex;
 225     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
 226 };
 227 typedef struct RAMState RAMState;
 228
 229 static RAMState ram_state;
 230
 231 uint64_t dup_mig_pages_transferred(void)
 232 {
 233     return ram_state.zero_pages;
 234 }
 235
 236 uint64_t norm_mig_pages_transferred(void)
 237 {
 238     return ram_state.norm_pages;
 239 }
 240
 241 uint64_t xbzrle_mig_bytes_transferred(void)
 242 {
 243     return ram_state.xbzrle_bytes;
 244 }
 245
 246 uint64_t xbzrle_mig_pages_transferred(void)
 247 {
 248     return ram_state.xbzrle_pages;
 249 }
 250
 251 uint64_t xbzrle_mig_pages_cache_miss(void)
 252 {
 253     return ram_state.xbzrle_cache_miss;
 254 }
 255
 256 double xbzrle_mig_cache_miss_rate(void)
 257 {
 258     return ram_state.xbzrle_cache_miss_rate;
 259 }
 260
 261 uint64_t xbzrle_mig_pages_overflow(void)
 262 {
 263     return ram_state.xbzrle_overflows;
 264 }
 265
 266 uint64_t ram_bytes_transferred(void)
 267 {
 268     return ram_state.bytes_transferred;
 269 }
 270
 271 uint64_t ram_bytes_remaining(void)
 272 {
 273     return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
 274 }
 275
 276 uint64_t ram_dirty_sync_count(void)
 277 {
 278     return ram_state.bitmap_sync_count;
 279 }
 280
 281 uint64_t ram_dirty_pages_rate(void)
 282 {
 283     return ram_state.dirty_pages_rate;
 284 }
 285
 286 uint64_t ram_postcopy_requests(void)
 287 {
 288     return ram_state.postcopy_requests;
 289 }
 290
 291 /* used by the search for pages to send */
 292 struct PageSearchStatus {
 293     /* Current block being searched */
 294     RAMBlock    *block;
 295     /* Current page to search from */
 296     unsigned long page;
 297     /* Set once we wrap around */
 298     bool         complete_round;
 299 };
 300 typedef struct PageSearchStatus PageSearchStatus;
 301
 302 struct CompressParam {
 303     bool done;
 304     bool quit;
 305     QEMUFile *file;
 306     QemuMutex mutex;
 307     QemuCond cond;
 308     RAMBlock *block;
 309     ram_addr_t offset;
 310 };
 311 typedef struct CompressParam CompressParam;
 312
 313 struct DecompressParam {
 314     bool done;
 315     bool quit;
 316     QemuMutex mutex;
 317     QemuCond cond;
 318     void *des;
 319     uint8_t *compbuf;
 320     int len;
 321 };
 322 typedef struct DecompressParam DecompressParam;
 323
/* Arrays with one entry per compression thread; allocated in
 * migrate_compress_threads_create() and freed (and reset to NULL) in
 * migrate_compress_threads_join(). */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts.
 * NOTE(review): their users are outside the visible part of the file. */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: used by do_data_compress() before its definition */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 342
 343 static void *do_data_compress(void *opaque)
 344 {
 345     CompressParam *param = opaque;
 346     RAMBlock *block;
 347     ram_addr_t offset;
 348
 349     qemu_mutex_lock(&param->mutex);
 350     while (!param->quit) {
 351         if (param->block) {
 352             block = param->block;
 353             offset = param->offset;
 354             param->block = NULL;
 355             qemu_mutex_unlock(&param->mutex);
 356
 357             do_compress_ram_page(param->file, block, offset);
 358
 359             qemu_mutex_lock(&comp_done_lock);
 360             param->done = true;
 361             qemu_cond_signal(&comp_done_cond);
 362             qemu_mutex_unlock(&comp_done_lock);
 363
 364             qemu_mutex_lock(&param->mutex);
 365         } else {
 366             qemu_cond_wait(&param->cond, &param->mutex);
 367         }
 368     }
 369     qemu_mutex_unlock(&param->mutex);
 370
 371     return NULL;
 372 }
 373
 374 static inline void terminate_compression_threads(void)
 375 {
 376     int idx, thread_count;
 377
 378     thread_count = migrate_compress_threads();
 379
 380     for (idx = 0; idx < thread_count; idx++) {
 381         qemu_mutex_lock(&comp_param[idx].mutex);
 382         comp_param[idx].quit = true;
 383         qemu_cond_signal(&comp_param[idx].cond);
 384         qemu_mutex_unlock(&comp_param[idx].mutex);
 385     }
 386 }
 387
 388 void migrate_compress_threads_join(void)
 389 {
 390     int i, thread_count;
 391
 392     if (!migrate_use_compression()) {
 393         return;
 394     }
 395     terminate_compression_threads();
 396     thread_count = migrate_compress_threads();
 397     for (i = 0; i < thread_count; i++) {
 398         qemu_thread_join(compress_threads + i);
 399         qemu_fclose(comp_param[i].file);
 400         qemu_mutex_destroy(&comp_param[i].mutex);
 401         qemu_cond_destroy(&comp_param[i].cond);
 402     }
 403     qemu_mutex_destroy(&comp_done_lock);
 404     qemu_cond_destroy(&comp_done_cond);
 405     g_free(compress_threads);
 406     g_free(comp_param);
 407     compress_threads = NULL;
 408     comp_param = NULL;
 409 }
 410
 411 void migrate_compress_threads_create(void)
 412 {
 413     int i, thread_count;
 414
 415     if (!migrate_use_compression()) {
 416         return;
 417     }
 418     thread_count = migrate_compress_threads();
 419     compress_threads = g_new0(QemuThread, thread_count);
 420     comp_param = g_new0(CompressParam, thread_count);
 421     qemu_cond_init(&comp_done_cond);
 422     qemu_mutex_init(&comp_done_lock);
 423     for (i = 0; i < thread_count; i++) {
 424         /* comp_param[i].file is just used as a dummy buffer to save data,
 425          * set its ops to empty.
 426          */
 427         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 428         comp_param[i].done = true;
 429         comp_param[i].quit = false;
 430         qemu_mutex_init(&comp_param[i].mutex);
 431         qemu_cond_init(&comp_param[i].cond);
 432         qemu_thread_create(compress_threads + i, "compress",
 433                            do_data_compress, comp_param + i,
 434                            QEMU_THREAD_JOINABLE);
 435     }
 436 }
 437
 438 /**
 439  * save_page_header: write page header to wire
 440  *
 441  * If this is the 1st block, it also writes the block identification
 442  *
 443  * Returns the number of bytes written
 444  *
 445  * @f: QEMUFile where to send the data
 446  * @block: block that contains the page we want to send
 447  * @offset: offset inside the block for the page
 448  *          in the lower bits, it contains flags
 449  */
 450 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 451                                ram_addr_t offset)
 452 {
 453     size_t size, len;
 454
 455     if (block == rs->last_sent_block) {
 456         offset |= RAM_SAVE_FLAG_CONTINUE;
 457     }
 458     qemu_put_be64(f, offset);
 459     size = 8;
 460
 461     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 462         len = strlen(block->idstr);
 463         qemu_put_byte(f, len);
 464         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 465         size += 1 + len;
 466         rs->last_sent_block = block;
 467     }
 468     return size;
 469 }
 470
 471 /**
 472  * mig_throttle_guest_down: throotle down the guest
 473  *
 474  * Reduce amount of guest cpu execution to hopefully slow down memory
 475  * writes. If guest dirty memory rate is reduced below the rate at
 476  * which we can transfer pages to the destination then we should be
 477  * able to complete migration. Some workloads dirty memory way too
 478  * fast and will not effectively converge, even with auto-converge.
 479  */
 480 static void mig_throttle_guest_down(void)
 481 {
 482     MigrationState *s = migrate_get_current();
 483     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 484     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 485
 486     /* We have not started throttling yet. Let's start it. */
 487     if (!cpu_throttle_active()) {
 488         cpu_throttle_set(pct_initial);
 489     } else {
 490         /* Throttling already on, just increase the rate */
 491         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 492     }
 493 }
 494
 495 /**
 496  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 497  *
 498  * @rs: current RAM state
 499  * @current_addr: address for the zero page
 500  *
 501  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 502  * The important thing is that a stale (not-yet-0'd) page be replaced
 503  * by the new data.
 504  * As a bonus, if the page wasn't in the cache it gets added so that
 505  * when a small write is made into the 0'd page it gets XBZRLE sent.
 506  */
 507 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 508 {
 509     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 510         return;
 511     }
 512
 513     /* We don't care if this fails to allocate a new cache page
 514      * as long as it updated an old one */
 515     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 516                  rs->bitmap_sync_count);
 517 }
 518
 519 #define ENCODING_FLAG_XBZRLE 0x1
 520
 521 /**
 522  * save_xbzrle_page: compress and send current page
 523  *
 524  * Returns: 1 means that we wrote the page
 525  *          0 means that page is identical to the one already sent
 526  *          -1 means that xbzrle would be longer than normal
 527  *
 528  * @rs: current RAM state
 529  * @current_data: pointer to the address of the page contents
 530  * @current_addr: addr of the page
 531  * @block: block that contains the page we want to send
 532  * @offset: offset inside the block for the page
 533  * @last_stage: if we are at the completion stage
 534  */
 535 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 536                             ram_addr_t current_addr, RAMBlock *block,
 537                             ram_addr_t offset, bool last_stage)
 538 {
 539     int encoded_len = 0, bytes_xbzrle;
 540     uint8_t *prev_cached_page;
 541
 542     if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
 543         rs->xbzrle_cache_miss++;
 544         if (!last_stage) {
 545             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 546                              rs->bitmap_sync_count) == -1) {
 547                 return -1;
 548             } else {
 549                 /* update *current_data when the page has been
 550                    inserted into cache */
 551                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 552             }
 553         }
 554         return -1;
 555     }
 556
 557     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 558
 559     /* save current buffer into memory */
 560     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 561
 562     /* XBZRLE encoding (if there is no overflow) */
 563     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 564                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 565                                        TARGET_PAGE_SIZE);
 566     if (encoded_len == 0) {
 567         trace_save_xbzrle_page_skipping();
 568         return 0;
 569     } else if (encoded_len == -1) {
 570         trace_save_xbzrle_page_overflow();
 571         rs->xbzrle_overflows++;
 572         /* update data in the cache */
 573         if (!last_stage) {
 574             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 575             *current_data = prev_cached_page;
 576         }
 577         return -1;
 578     }
 579
 580     /* we need to update the data in the cache, in order to get the same data */
 581     if (!last_stage) {
 582         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 583     }
 584
 585     /* Send XBZRLE based compressed page */
 586     bytes_xbzrle = save_page_header(rs, rs->f, block,
 587                                     offset | RAM_SAVE_FLAG_XBZRLE);
 588     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 589     qemu_put_be16(rs->f, encoded_len);
 590     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 591     bytes_xbzrle += encoded_len + 1 + 2;
 592     rs->xbzrle_pages++;
 593     rs->xbzrle_bytes += bytes_xbzrle;
 594     rs->bytes_transferred += bytes_xbzrle;
 595
 596     return 1;
 597 }
 598
 599 /**
 600  * migration_bitmap_find_dirty: find the next dirty page from start
 601  *
 602  * Called with rcu_read_lock() to protect migration_bitmap
 603  *
 604  * Returns the byte offset within memory region of the start of a dirty page
 605  *
 606  * @rs: current RAM state
 607  * @rb: RAMBlock where to search for dirty pages
 608  * @start: page where we start the search
 609  */
 610 static inline
 611 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 612                                           unsigned long start)
 613 {
 614     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 615     unsigned long *bitmap = rb->bmap;
 616     unsigned long next;
 617
 618     if (rs->ram_bulk_stage && start > 0) {
 619         next = start + 1;
 620     } else {
 621         next = find_next_bit(bitmap, size, start);
 622     }
 623
 624     return next;
 625 }
 626
 627 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 628                                                 RAMBlock *rb,
 629                                                 unsigned long page)
 630 {
 631     bool ret;
 632
 633     ret = test_and_clear_bit(page, rb->bmap);
 634
 635     if (ret) {
 636         rs->migration_dirty_pages--;
 637     }
 638     return ret;
 639 }
 640
 641 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 642                                         ram_addr_t start, ram_addr_t length)
 643 {
 644     rs->migration_dirty_pages +=
 645         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 646                                               &rs->num_dirty_pages_period);
 647 }
 648
 649 /**
 650  * ram_pagesize_summary: calculate all the pagesizes of a VM
 651  *
 652  * Returns a summary bitmap of the page sizes of all RAMBlocks
 653  *
 654  * For VMs with just normal pages this is equivalent to the host page
 655  * size. If it's got some huge pages then it's the OR of all the
 656  * different page sizes.
 657  */
 658 uint64_t ram_pagesize_summary(void)
 659 {
 660     RAMBlock *block;
 661     uint64_t summary = 0;
 662
 663     RAMBLOCK_FOREACH(block) {
 664         summary |= block->page_size;
 665     }
 666
 667     return summary;
 668 }
 669
 670 static void migration_bitmap_sync(RAMState *rs)
 671 {
 672     RAMBlock *block;
 673     int64_t end_time;
 674     uint64_t bytes_xfer_now;
 675
 676     rs->bitmap_sync_count++;
 677
 678     if (!rs->time_last_bitmap_sync) {
 679         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 680     }
 681
 682     trace_migration_bitmap_sync_start();
 683     memory_global_dirty_log_sync();
 684
 685     qemu_mutex_lock(&rs->bitmap_mutex);
 686     rcu_read_lock();
 687     RAMBLOCK_FOREACH(block) {
 688         migration_bitmap_sync_range(rs, block, 0, block->used_length);
 689     }
 690     rcu_read_unlock();
 691     qemu_mutex_unlock(&rs->bitmap_mutex);
 692
 693     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 694
 695     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 696
 697     /* more than 1 second = 1000 millisecons */
 698     if (end_time > rs->time_last_bitmap_sync + 1000) {
 699         /* calculate period counters */
 700         rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
 701             / (end_time - rs->time_last_bitmap_sync);
 702         bytes_xfer_now = ram_bytes_transferred();
 703
 704         if (migrate_auto_converge()) {
 705             /* The following detection logic can be refined later. For now:
 706                Check to see if the dirtied bytes is 50% more than the approx.
 707                amount of bytes that just got transferred since the last time we
 708                were in this routine. If that happens twice, start or increase
 709                throttling */
 710
 711             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 712                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 713                 (++rs->dirty_rate_high_cnt >= 2)) {
 714                     trace_migration_throttle();
 715                     rs->dirty_rate_high_cnt = 0;
 716                     mig_throttle_guest_down();
 717             }
 718         }
 719
 720         if (migrate_use_xbzrle()) {
 721             if (rs->iterations_prev != rs->iterations) {
 722                 rs->xbzrle_cache_miss_rate =
 723                    (double)(rs->xbzrle_cache_miss -
 724                             rs->xbzrle_cache_miss_prev) /
 725                    (rs->iterations - rs->iterations_prev);
 726             }
 727             rs->iterations_prev = rs->iterations;
 728             rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
 729         }
 730
 731         /* reset period counters */
 732         rs->time_last_bitmap_sync = end_time;
 733         rs->num_dirty_pages_period = 0;
 734         rs->bytes_xfer_prev = bytes_xfer_now;
 735     }
 736     if (migrate_use_events()) {
 737         qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
 738     }
 739 }
 740
 741 /**
 742  * save_zero_page: send the zero page to the stream
 743  *
 744  * Returns the number of pages written.
 745  *
 746  * @rs: current RAM state
 747  * @block: block that contains the page we want to send
 748  * @offset: offset inside the block for the page
 749  * @p: pointer to the page
 750  */
 751 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 752                           uint8_t *p)
 753 {
 754     int pages = -1;
 755
 756     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 757         rs->zero_pages++;
 758         rs->bytes_transferred +=
 759             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 760         qemu_put_byte(rs->f, 0);
 761         rs->bytes_transferred += 1;
 762         pages = 1;
 763     }
 764
 765     return pages;
 766 }
 767
 768 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 769 {
 770     if (!migrate_release_ram() || !migration_in_postcopy()) {
 771         return;
 772     }
 773
 774     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 775 }
 776
 777 /**
 778  * ram_save_page: send the given page to the stream
 779  *
 780  * Returns the number of pages written.
 781  *          < 0 - error
 782  *          >=0 - Number of pages written - this might legally be 0
 783  *                if xbzrle noticed the page was the same.
 784  *
 785  * @rs: current RAM state
 786  * @block: block that contains the page we want to send
 787  * @offset: offset inside the block for the page
 788  * @last_stage: if we are at the completion stage
 789  */
 790 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 791 {
 792     int pages = -1;
 793     uint64_t bytes_xmit;
 794     ram_addr_t current_addr;
 795     uint8_t *p;
 796     int ret;
 797     bool send_async = true;
 798     RAMBlock *block = pss->block;
 799     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 800
 801     p = block->host + offset;
 802     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 803
 804     /* In doubt sent page as normal */
 805     bytes_xmit = 0;
 806     ret = ram_control_save_page(rs->f, block->offset,
 807                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 808     if (bytes_xmit) {
 809         rs->bytes_transferred += bytes_xmit;
 810         pages = 1;
 811     }
 812
 813     XBZRLE_cache_lock();
 814
 815     current_addr = block->offset + offset;
 816
 817     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 818         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 819             if (bytes_xmit > 0) {
 820                 rs->norm_pages++;
 821             } else if (bytes_xmit == 0) {
 822                 rs->zero_pages++;
 823             }
 824         }
 825     } else {
 826         pages = save_zero_page(rs, block, offset, p);
 827         if (pages > 0) {
 828             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 829              * page would be stale
 830              */
 831             xbzrle_cache_zero_page(rs, current_addr);
 832             ram_release_pages(block->idstr, offset, pages);
 833         } else if (!rs->ram_bulk_stage &&
 834                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 835             pages = save_xbzrle_page(rs, &p, current_addr, block,
 836                                      offset, last_stage);
 837             if (!last_stage) {
 838                 /* Can't send this cached data async, since the cache page
 839                  * might get updated before it gets to the wire
 840                  */
 841                 send_async = false;
 842             }
 843         }
 844     }
 845
 846     /* XBZRLE overflow or normal page */
 847     if (pages == -1) {
 848         rs->bytes_transferred += save_page_header(rs, rs->f, block,
 849                                                   offset | RAM_SAVE_FLAG_PAGE);
 850         if (send_async) {
 851             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
 852                                   migrate_release_ram() &
 853                                   migration_in_postcopy());
 854         } else {
 855             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
 856         }
 857         rs->bytes_transferred += TARGET_PAGE_SIZE;
 858         pages = 1;
 859         rs->norm_pages++;
 860     }
 861
 862     XBZRLE_cache_unlock();
 863
 864     return pages;
 865 }
 866
 867 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 868                                 ram_addr_t offset)
 869 {
 870     RAMState *rs = &ram_state;
 871     int bytes_sent, blen;
 872     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 873
 874     bytes_sent = save_page_header(rs, f, block, offset |
 875                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 876     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 877                                      migrate_compress_level());
 878     if (blen < 0) {
 879         bytes_sent = 0;
 880         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 881         error_report("compressed data failed!");
 882     } else {
 883         bytes_sent += blen;
 884         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
 885     }
 886
 887     return bytes_sent;
 888 }
 889
 890 static void flush_compressed_data(RAMState *rs)
 891 {
 892     int idx, len, thread_count;
 893
 894     if (!migrate_use_compression()) {
 895         return;
 896     }
 897     thread_count = migrate_compress_threads();
 898
 899     qemu_mutex_lock(&comp_done_lock);
 900     for (idx = 0; idx < thread_count; idx++) {
 901         while (!comp_param[idx].done) {
 902             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 903         }
 904     }
 905     qemu_mutex_unlock(&comp_done_lock);
 906
 907     for (idx = 0; idx < thread_count; idx++) {
 908         qemu_mutex_lock(&comp_param[idx].mutex);
 909         if (!comp_param[idx].quit) {
 910             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 911             rs->bytes_transferred += len;
 912         }
 913         qemu_mutex_unlock(&comp_param[idx].mutex);
 914     }
 915 }
 916
 917 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 918                                        ram_addr_t offset)
 919 {
 920     param->block = block;
 921     param->offset = offset;
 922 }
 923
 924 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
 925                                            ram_addr_t offset)
 926 {
 927     int idx, thread_count, bytes_xmit = -1, pages = -1;
 928
 929     thread_count = migrate_compress_threads();
 930     qemu_mutex_lock(&comp_done_lock);
 931     while (true) {
 932         for (idx = 0; idx < thread_count; idx++) {
 933             if (comp_param[idx].done) {
 934                 comp_param[idx].done = false;
 935                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 936                 qemu_mutex_lock(&comp_param[idx].mutex);
 937                 set_compress_params(&comp_param[idx], block, offset);
 938                 qemu_cond_signal(&comp_param[idx].cond);
 939                 qemu_mutex_unlock(&comp_param[idx].mutex);
 940                 pages = 1;
 941                 rs->norm_pages++;
 942                 rs->bytes_transferred += bytes_xmit;
 943                 break;
 944             }
 945         }
 946         if (pages > 0) {
 947             break;
 948         } else {
 949             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 950         }
 951     }
 952     qemu_mutex_unlock(&comp_done_lock);
 953
 954     return pages;
 955 }
 956
 957 /**
 958  * ram_save_compressed_page: compress the given page and send it to the stream
 959  *
 960  * Returns the number of pages written.
 961  *
 962  * @rs: current RAM state
 963  * @block: block that contains the page we want to send
 964  * @offset: offset inside the block for the page
 965  * @last_stage: if we are at the completion stage
 966  */
 967 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
 968                                     bool last_stage)
 969 {
 970     int pages = -1;
 971     uint64_t bytes_xmit = 0;
 972     uint8_t *p;
 973     int ret, blen;
 974     RAMBlock *block = pss->block;
 975     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 976
 977     p = block->host + offset;
 978
 979     ret = ram_control_save_page(rs->f, block->offset,
 980                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 981     if (bytes_xmit) {
 982         rs->bytes_transferred += bytes_xmit;
 983         pages = 1;
 984     }
 985     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 986         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 987             if (bytes_xmit > 0) {
 988                 rs->norm_pages++;
 989             } else if (bytes_xmit == 0) {
 990                 rs->zero_pages++;
 991             }
 992         }
 993     } else {
 994         /* When starting the process of a new block, the first page of
 995          * the block should be sent out before other pages in the same
 996          * block, and all the pages in last block should have been sent
 997          * out, keeping this order is important, because the 'cont' flag
 998          * is used to avoid resending the block name.
 999          */
1000         if (block != rs->last_sent_block) {
1001             flush_compressed_data(rs);
1002             pages = save_zero_page(rs, block, offset, p);
1003             if (pages == -1) {
1004                 /* Make sure the first page is sent out before other pages */
1005                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1006                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
1007                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1008                                                  migrate_compress_level());
1009                 if (blen > 0) {
1010                     rs->bytes_transferred += bytes_xmit + blen;
1011                     rs->norm_pages++;
1012                     pages = 1;
1013                 } else {
1014                     qemu_file_set_error(rs->f, blen);
1015                     error_report("compressed data failed!");
1016                 }
1017             }
1018             if (pages > 0) {
1019                 ram_release_pages(block->idstr, offset, pages);
1020             }
1021         } else {
1022             pages = save_zero_page(rs, block, offset, p);
1023             if (pages == -1) {
1024                 pages = compress_page_with_multi_thread(rs, block, offset);
1025             } else {
1026                 ram_release_pages(block->idstr, offset, pages);
1027             }
1028         }
1029     }
1030
1031     return pages;
1032 }
1033
1034 /**
1035  * find_dirty_block: find the next dirty page and update any state
1036  * associated with the search process.
1037  *
1038  * Returns if a page is found
1039  *
1040  * @rs: current RAM state
1041  * @pss: data about the state of the current dirty page scan
1042  * @again: set to false if the search has scanned the whole of RAM
1043  */
1044 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1045 {
1046     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1047     if (pss->complete_round && pss->block == rs->last_seen_block &&
1048         pss->page >= rs->last_page) {
1049         /*
1050          * We've been once around the RAM and haven't found anything.
1051          * Give up.
1052          */
1053         *again = false;
1054         return false;
1055     }
1056     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1057         /* Didn't find anything in this RAM Block */
1058         pss->page = 0;
1059         pss->block = QLIST_NEXT_RCU(pss->block, next);
1060         if (!pss->block) {
1061             /* Hit the end of the list */
1062             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1063             /* Flag that we've looped */
1064             pss->complete_round = true;
1065             rs->ram_bulk_stage = false;
1066             if (migrate_use_xbzrle()) {
1067                 /* If xbzrle is on, stop using the data compression at this
1068                  * point. In theory, xbzrle can do better than compression.
1069                  */
1070                 flush_compressed_data(rs);
1071             }
1072         }
1073         /* Didn't find anything this time, but try again on the new block */
1074         *again = true;
1075         return false;
1076     } else {
1077         /* Can go around again, but... */
1078         *again = true;
1079         /* We've found something so probably don't need to */
1080         return true;
1081     }
1082 }
1083
1084 /**
1085  * unqueue_page: gets a page of the queue
1086  *
1087  * Helper for 'get_queued_page' - gets a page off the queue
1088  *
1089  * Returns the block of the page (or NULL if none available)
1090  *
1091  * @rs: current RAM state
1092  * @offset: used to return the offset within the RAMBlock
1093  */
1094 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1095 {
1096     RAMBlock *block = NULL;
1097
1098     qemu_mutex_lock(&rs->src_page_req_mutex);
1099     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1100         struct RAMSrcPageRequest *entry =
1101                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1102         block = entry->rb;
1103         *offset = entry->offset;
1104
1105         if (entry->len > TARGET_PAGE_SIZE) {
1106             entry->len -= TARGET_PAGE_SIZE;
1107             entry->offset += TARGET_PAGE_SIZE;
1108         } else {
1109             memory_region_unref(block->mr);
1110             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1111             g_free(entry);
1112         }
1113     }
1114     qemu_mutex_unlock(&rs->src_page_req_mutex);
1115
1116     return block;
1117 }
1118
1119 /**
1120  * get_queued_page: unqueue a page from the postocpy requests
1121  *
1122  * Skips pages that are already sent (!dirty)
1123  *
1124  * Returns if a queued page is found
1125  *
1126  * @rs: current RAM state
1127  * @pss: data about the state of the current dirty page scan
1128  */
1129 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1130 {
1131     RAMBlock  *block;
1132     ram_addr_t offset;
1133     bool dirty;
1134
1135     do {
1136         block = unqueue_page(rs, &offset);
1137         /*
1138          * We're sending this page, and since it's postcopy nothing else
1139          * will dirty it, and we must make sure it doesn't get sent again
1140          * even if this queue request was received after the background
1141          * search already sent it.
1142          */
1143         if (block) {
1144             unsigned long page;
1145
1146             page = offset >> TARGET_PAGE_BITS;
1147             dirty = test_bit(page, block->bmap);
1148             if (!dirty) {
1149                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1150                        page, test_bit(page, block->unsentmap));
1151             } else {
1152                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1153             }
1154         }
1155
1156     } while (block && !dirty);
1157
1158     if (block) {
1159         /*
1160          * As soon as we start servicing pages out of order, then we have
1161          * to kill the bulk stage, since the bulk stage assumes
1162          * in (migration_bitmap_find_and_reset_dirty) that every page is
1163          * dirty, that's no longer true.
1164          */
1165         rs->ram_bulk_stage = false;
1166
1167         /*
1168          * We want the background search to continue from the queued page
1169          * since the guest is likely to want other pages near to the page
1170          * it just requested.
1171          */
1172         pss->block = block;
1173         pss->page = offset >> TARGET_PAGE_BITS;
1174     }
1175
1176     return !!block;
1177 }
1178
1179 /**
1180  * migration_page_queue_free: drop any remaining pages in the ram
1181  * request queue
1182  *
1183  * It should be empty at the end anyway, but in error cases there may
1184  * be some left.  in case that there is any page left, we drop it.
1185  *
1186  */
1187 static void migration_page_queue_free(RAMState *rs)
1188 {
1189     struct RAMSrcPageRequest *mspr, *next_mspr;
1190     /* This queue generally should be empty - but in the case of a failed
1191      * migration might have some droppings in.
1192      */
1193     rcu_read_lock();
1194     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1195         memory_region_unref(mspr->rb->mr);
1196         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1197         g_free(mspr);
1198     }
1199     rcu_read_unlock();
1200 }
1201
1202 /**
1203  * ram_save_queue_pages: queue the page for transmission
1204  *
1205  * A request from postcopy destination for example.
1206  *
1207  * Returns zero on success or negative on error
1208  *
1209  * @rbname: Name of the RAMBLock of the request. NULL means the
1210  *          same that last one.
1211  * @start: starting address from the start of the RAMBlock
1212  * @len: length (in bytes) to send
1213  */
1214 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1215 {
1216     RAMBlock *ramblock;
1217     RAMState *rs = &ram_state;
1218
1219     rs->postcopy_requests++;
1220     rcu_read_lock();
1221     if (!rbname) {
1222         /* Reuse last RAMBlock */
1223         ramblock = rs->last_req_rb;
1224
1225         if (!ramblock) {
1226             /*
1227              * Shouldn't happen, we can't reuse the last RAMBlock if
1228              * it's the 1st request.
1229              */
1230             error_report("ram_save_queue_pages no previous block");
1231             goto err;
1232         }
1233     } else {
1234         ramblock = qemu_ram_block_by_name(rbname);
1235
1236         if (!ramblock) {
1237             /* We shouldn't be asked for a non-existent RAMBlock */
1238             error_report("ram_save_queue_pages no block '%s'", rbname);
1239             goto err;
1240         }
1241         rs->last_req_rb = ramblock;
1242     }
1243     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1244     if (start+len > ramblock->used_length) {
1245         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1246                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1247                      __func__, start, len, ramblock->used_length);
1248         goto err;
1249     }
1250
1251     struct RAMSrcPageRequest *new_entry =
1252         g_malloc0(sizeof(struct RAMSrcPageRequest));
1253     new_entry->rb = ramblock;
1254     new_entry->offset = start;
1255     new_entry->len = len;
1256
1257     memory_region_ref(ramblock->mr);
1258     qemu_mutex_lock(&rs->src_page_req_mutex);
1259     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1260     qemu_mutex_unlock(&rs->src_page_req_mutex);
1261     rcu_read_unlock();
1262
1263     return 0;
1264
1265 err:
1266     rcu_read_unlock();
1267     return -1;
1268 }
1269
1270 /**
1271  * ram_save_target_page: save one target page
1272  *
1273  * Returns the number of pages written
1274  *
1275  * @rs: current RAM state
1276  * @ms: current migration state
1277  * @pss: data about the page we want to send
1278  * @last_stage: if we are at the completion stage
1279  */
1280 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1281                                 bool last_stage)
1282 {
1283     int res = 0;
1284
1285     /* Check the pages is dirty and if it is send it */
1286     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1287         /*
1288          * If xbzrle is on, stop using the data compression after first
1289          * round of migration even if compression is enabled. In theory,
1290          * xbzrle can do better than compression.
1291          */
1292         if (migrate_use_compression() &&
1293             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1294             res = ram_save_compressed_page(rs, pss, last_stage);
1295         } else {
1296             res = ram_save_page(rs, pss, last_stage);
1297         }
1298
1299         if (res < 0) {
1300             return res;
1301         }
1302         if (pss->block->unsentmap) {
1303             clear_bit(pss->page, pss->block->unsentmap);
1304         }
1305     }
1306
1307     return res;
1308 }
1309
1310 /**
1311  * ram_save_host_page: save a whole host page
1312  *
1313  * Starting at *offset send pages up to the end of the current host
1314  * page. It's valid for the initial offset to point into the middle of
1315  * a host page in which case the remainder of the hostpage is sent.
1316  * Only dirty target pages are sent. Note that the host page size may
1317  * be a huge page for this block.
1318  * The saving stops at the boundary of the used_length of the block
1319  * if the RAMBlock isn't a multiple of the host page size.
1320  *
1321  * Returns the number of pages written or negative on error
1322  *
1323  * @rs: current RAM state
1324  * @ms: current migration state
1325  * @pss: data about the page we want to send
1326  * @last_stage: if we are at the completion stage
1327  */
1328 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1329                               bool last_stage)
1330 {
1331     int tmppages, pages = 0;
1332     size_t pagesize_bits =
1333         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1334
1335     do {
1336         tmppages = ram_save_target_page(rs, pss, last_stage);
1337         if (tmppages < 0) {
1338             return tmppages;
1339         }
1340
1341         pages += tmppages;
1342         pss->page++;
1343     } while ((pss->page & (pagesize_bits - 1)) &&
1344              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1345
1346     /* The offset we leave with is the last one we looked at */
1347     pss->page--;
1348     return pages;
1349 }
1350
1351 /**
1352  * ram_find_and_save_block: finds a dirty page and sends it to f
1353  *
1354  * Called within an RCU critical section.
1355  *
1356  * Returns the number of pages written where zero means no dirty pages
1357  *
1358  * @rs: current RAM state
1359  * @last_stage: if we are at the completion stage
1360  *
1361  * On systems where host-page-size > target-page-size it will send all the
1362  * pages in a host page that are dirty.
1363  */
1364
1365 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1366 {
1367     PageSearchStatus pss;
1368     int pages = 0;
1369     bool again, found;
1370
1371     /* No dirty page as there is zero RAM */
1372     if (!ram_bytes_total()) {
1373         return pages;
1374     }
1375
1376     pss.block = rs->last_seen_block;
1377     pss.page = rs->last_page;
1378     pss.complete_round = false;
1379
1380     if (!pss.block) {
1381         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1382     }
1383
1384     do {
1385         again = true;
1386         found = get_queued_page(rs, &pss);
1387
1388         if (!found) {
1389             /* priority queue empty, so just search for something dirty */
1390             found = find_dirty_block(rs, &pss, &again);
1391         }
1392
1393         if (found) {
1394             pages = ram_save_host_page(rs, &pss, last_stage);
1395         }
1396     } while (!pages && again);
1397
1398     rs->last_seen_block = pss.block;
1399     rs->last_page = pss.page;
1400
1401     return pages;
1402 }
1403
1404 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1405 {
1406     uint64_t pages = size / TARGET_PAGE_SIZE;
1407     RAMState *rs = &ram_state;
1408
1409     if (zero) {
1410         rs->zero_pages += pages;
1411     } else {
1412         rs->norm_pages += pages;
1413         rs->bytes_transferred += size;
1414         qemu_update_position(f, size);
1415     }
1416 }
1417
1418 uint64_t ram_bytes_total(void)
1419 {
1420     RAMBlock *block;
1421     uint64_t total = 0;
1422
1423     rcu_read_lock();
1424     RAMBLOCK_FOREACH(block) {
1425         total += block->used_length;
1426     }
1427     rcu_read_unlock();
1428     return total;
1429 }
1430
1431 void free_xbzrle_decoded_buf(void)
1432 {
1433     g_free(xbzrle_decoded_buf);
1434     xbzrle_decoded_buf = NULL;
1435 }
1436
1437 static void ram_migration_cleanup(void *opaque)
1438 {
1439     RAMState *rs = opaque;
1440     RAMBlock *block;
1441
1442     /* caller have hold iothread lock or is in a bh, so there is
1443      * no writing race against this migration_bitmap
1444      */
1445     memory_global_dirty_log_stop();
1446
1447     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1448         g_free(block->bmap);
1449         block->bmap = NULL;
1450         g_free(block->unsentmap);
1451         block->unsentmap = NULL;
1452     }
1453
1454     XBZRLE_cache_lock();
1455     if (XBZRLE.cache) {
1456         cache_fini(XBZRLE.cache);
1457         g_free(XBZRLE.encoded_buf);
1458         g_free(XBZRLE.current_buf);
1459         g_free(ZERO_TARGET_PAGE);
1460         XBZRLE.cache = NULL;
1461         XBZRLE.encoded_buf = NULL;
1462         XBZRLE.current_buf = NULL;
1463     }
1464     XBZRLE_cache_unlock();
1465     migration_page_queue_free(rs);
1466 }
1467
1468 static void ram_state_reset(RAMState *rs)
1469 {
1470     rs->last_seen_block = NULL;
1471     rs->last_sent_block = NULL;
1472     rs->last_page = 0;
1473     rs->last_version = ram_list.version;
1474     rs->ram_bulk_stage = true;
1475 }
1476
1477 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1478
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, up to 128 bits per line
 *
 * Set bits are printed as '1' and clear bits as '.'; each printed line is
 * prefixed with the index of its first bit.
 *
 * 'todump' is the bitmap to print.  If it is NULL there is nothing to
 * dump and the function returns without printing anything.
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * 'pages' is the number of bits of 'todump' to look at.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* No bitmap was supplied; test_bit() must not be handed NULL */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* linelen never exceeds 128, so the terminator fits */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
1512
1513 /* **** functions for postcopy ***** */
1514
1515 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1516 {
1517     struct RAMBlock *block;
1518
1519     RAMBLOCK_FOREACH(block) {
1520         unsigned long *bitmap = block->bmap;
1521         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1522         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1523
1524         while (run_start < range) {
1525             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1526             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1527                               (run_end - run_start) << TARGET_PAGE_BITS);
1528             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1529         }
1530     }
1531 }
1532
1533 /**
1534  * postcopy_send_discard_bm_ram: discard a RAMBlock
1535  *
1536  * Returns zero on success
1537  *
1538  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1539  * Note: At this point the 'unsentmap' is the processed bitmap combined
1540  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1541  *
1542  * @ms: current migration state
1543  * @pds: state for postcopy
1544  * @start: RAMBlock starting page
1545  * @length: RAMBlock size
1546  */
1547 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1548                                         PostcopyDiscardState *pds,
1549                                         RAMBlock *block)
1550 {
1551     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1552     unsigned long current;
1553     unsigned long *unsentmap = block->unsentmap;
1554
1555     for (current = 0; current < end; ) {
1556         unsigned long one = find_next_bit(unsentmap, end, current);
1557
1558         if (one <= end) {
1559             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1560             unsigned long discard_length;
1561
1562             if (zero >= end) {
1563                 discard_length = end - one;
1564             } else {
1565                 discard_length = zero - one;
1566             }
1567             if (discard_length) {
1568                 postcopy_discard_send_range(ms, pds, one, discard_length);
1569             }
1570             current = one + discard_length;
1571         } else {
1572             current = one;
1573         }
1574     }
1575
1576     return 0;
1577 }
1578
1579 /**
1580  * postcopy_each_ram_send_discard: discard all RAMBlocks
1581  *
1582  * Returns 0 for success or negative for error
1583  *
1584  * Utility for the outgoing postcopy code.
1585  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1586  *   passing it bitmap indexes and name.
1587  * (qemu_ram_foreach_block ends up passing unscaled lengths
1588  *  which would mean postcopy code would have to deal with target page)
1589  *
1590  * @ms: current migration state
1591  */
1592 static int postcopy_each_ram_send_discard(MigrationState *ms)
1593 {
1594     struct RAMBlock *block;
1595     int ret;
1596
1597     RAMBLOCK_FOREACH(block) {
1598         PostcopyDiscardState *pds =
1599             postcopy_discard_send_init(ms, block->idstr);
1600
1601         /*
1602          * Postcopy sends chunks of bitmap over the wire, but it
1603          * just needs indexes at this point, avoids it having
1604          * target page specific code.
1605          */
1606         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1607         postcopy_discard_send_finish(ms, pds);
1608         if (ret) {
1609             return ret;
1610         }
1611     }
1612
1613     return 0;
1614 }
1615
1616 /**
1617  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1618  *
1619  * Helper for postcopy_chunk_hostpages; it's called twice to
1620  * canonicalize the two bitmaps, that are similar, but one is
1621  * inverted.
1622  *
1623  * Postcopy requires that all target pages in a hostpage are dirty or
1624  * clean, not a mix.  This function canonicalizes the bitmaps.
1625  *
1626  * @ms: current migration state
1627  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1628  *               otherwise we need to canonicalize partially dirty host pages
1629  * @block: block that contains the page we want to canonicalize
1630  * @pds: state for postcopy
1631  */
1632 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1633                                           RAMBlock *block,
1634                                           PostcopyDiscardState *pds)
1635 {
1636     RAMState *rs = &ram_state;
1637     unsigned long *bitmap = block->bmap;
1638     unsigned long *unsentmap = block->unsentmap;
1639     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1640     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1641     unsigned long run_start;
1642
1643     if (block->page_size == TARGET_PAGE_SIZE) {
1644         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1645         return;
1646     }
1647
1648     if (unsent_pass) {
1649         /* Find a sent page */
1650         run_start = find_next_zero_bit(unsentmap, pages, 0);
1651     } else {
1652         /* Find a dirty page */
1653         run_start = find_next_bit(bitmap, pages, 0);
1654     }
1655
1656     while (run_start < pages) {
1657         bool do_fixup = false;
1658         unsigned long fixup_start_addr;
1659         unsigned long host_offset;
1660
1661         /*
1662          * If the start of this run of pages is in the middle of a host
1663          * page, then we need to fixup this host page.
1664          */
1665         host_offset = run_start % host_ratio;
1666         if (host_offset) {
1667             do_fixup = true;
1668             run_start -= host_offset;
1669             fixup_start_addr = run_start;
1670             /* For the next pass */
1671             run_start = run_start + host_ratio;
1672         } else {
1673             /* Find the end of this run */
1674             unsigned long run_end;
1675             if (unsent_pass) {
1676                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1677             } else {
1678                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1679             }
1680             /*
1681              * If the end isn't at the start of a host page, then the
1682              * run doesn't finish at the end of a host page
1683              * and we need to discard.
1684              */
1685             host_offset = run_end % host_ratio;
1686             if (host_offset) {
1687                 do_fixup = true;
1688                 fixup_start_addr = run_end - host_offset;
1689                 /*
1690                  * This host page has gone, the next loop iteration starts
1691                  * from after the fixup
1692                  */
1693                 run_start = fixup_start_addr + host_ratio;
1694             } else {
1695                 /*
1696                  * No discards on this iteration, next loop starts from
1697                  * next sent/dirty page
1698                  */
1699                 run_start = run_end + 1;
1700             }
1701         }
1702
1703         if (do_fixup) {
1704             unsigned long page;
1705
1706             /* Tell the destination to discard this page */
1707             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1708                 /* For the unsent_pass we:
1709                  *     discard partially sent pages
1710                  * For the !unsent_pass (dirty) we:
1711                  *     discard partially dirty pages that were sent
1712                  *     (any partially sent pages were already discarded
1713                  *     by the previous unsent_pass)
1714                  */
1715                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1716                                             host_ratio);
1717             }
1718
1719             /* Clean up the bitmap */
1720             for (page = fixup_start_addr;
1721                  page < fixup_start_addr + host_ratio; page++) {
1722                 /* All pages in this host page are now not sent */
1723                 set_bit(page, unsentmap);
1724
1725                 /*
1726                  * Remark them as dirty, updating the count for any pages
1727                  * that weren't previously dirty.
1728                  */
1729                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1730             }
1731         }
1732
1733         if (unsent_pass) {
1734             /* Find the next sent page for the next iteration */
1735             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1736         } else {
1737             /* Find the next dirty page for the next iteration */
1738             run_start = find_next_bit(bitmap, pages, run_start);
1739         }
1740     }
1741 }
1742
1743 /**
1744  * postcopy_chuck_hostpages: discrad any partially sent host page
1745  *
1746  * Utility for the outgoing postcopy code.
1747  *
1748  * Discard any partially sent host-page size chunks, mark any partially
1749  * dirty host-page size chunks as all dirty.  In this case the host-page
1750  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1751  *
1752  * Returns zero on success
1753  *
1754  * @ms: current migration state
1755  * @block: block we want to work with
1756  */
1757 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1758 {
1759     PostcopyDiscardState *pds =
1760         postcopy_discard_send_init(ms, block->idstr);
1761
1762     /* First pass: Discard all partially sent host pages */
1763     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1764     /*
1765      * Second pass: Ensure that all partially dirty host pages are made
1766      * fully dirty.
1767      */
1768     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1769
1770     postcopy_discard_send_finish(ms, pds);
1771     return 0;
1772 }
1773
1774 /**
1775  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1776  *
1777  * Returns zero on success
1778  *
1779  * Transmit the set of pages to be discarded after precopy to the target
1780  * these are pages that:
1781  *     a) Have been previously transmitted but are now dirty again
1782  *     b) Pages that have never been transmitted, this ensures that
1783  *        any pages on the destination that have been mapped by background
1784  *        tasks get discarded (transparent huge pages is the specific concern)
1785  * Hopefully this is pretty sparse
1786  *
1787  * @ms: current migration state
1788  */
1789 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1790 {
1791     RAMState *rs = &ram_state;
1792     RAMBlock *block;
1793     int ret;
1794
1795     rcu_read_lock();
1796
1797     /* This should be our last sync, the src is now paused */
1798     migration_bitmap_sync(rs);
1799
1800     /* Easiest way to make sure we don't resume in the middle of a host-page */
1801     rs->last_seen_block = NULL;
1802     rs->last_sent_block = NULL;
1803     rs->last_page = 0;
1804
1805     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1806         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1807         unsigned long *bitmap = block->bmap;
1808         unsigned long *unsentmap = block->unsentmap;
1809
1810         if (!unsentmap) {
1811             /* We don't have a safe way to resize the sentmap, so
1812              * if the bitmap was resized it will be NULL at this
1813              * point.
1814              */
1815             error_report("migration ram resized during precopy phase");
1816             rcu_read_unlock();
1817             return -EINVAL;
1818         }
1819         /* Deal with TPS != HPS and huge pages */
1820         ret = postcopy_chunk_hostpages(ms, block);
1821         if (ret) {
1822             rcu_read_unlock();
1823             return ret;
1824         }
1825
1826         /*
1827          * Update the unsentmap to be unsentmap = unsentmap | dirty
1828          */
1829         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1830 #ifdef DEBUG_POSTCOPY
1831         ram_debug_dump_bitmap(unsentmap, true, pages);
1832 #endif
1833     }
1834     trace_ram_postcopy_send_discard_bitmap();
1835
1836     ret = postcopy_each_ram_send_discard(ms);
1837     rcu_read_unlock();
1838
1839     return ret;
1840 }
1841
1842 /**
1843  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1844  *
1845  * Returns zero on success
1846  *
1847  * @rbname: name of the RAMBlock of the request. NULL means the
1848  *          same that last one.
1849  * @start: RAMBlock starting page
1850  * @length: RAMBlock size
1851  */
1852 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1853 {
1854     int ret = -1;
1855
1856     trace_ram_discard_range(rbname, start, length);
1857
1858     rcu_read_lock();
1859     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1860
1861     if (!rb) {
1862         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1863         goto err;
1864     }
1865
1866     ret = ram_block_discard_range(rb, start, length);
1867
1868 err:
1869     rcu_read_unlock();
1870
1871     return ret;
1872 }
1873
1874 static int ram_state_init(RAMState *rs)
1875 {
1876     memset(rs, 0, sizeof(*rs));
1877     qemu_mutex_init(&rs->bitmap_mutex);
1878     qemu_mutex_init(&rs->src_page_req_mutex);
1879     QSIMPLEQ_INIT(&rs->src_page_requests);
1880
1881     if (migrate_use_xbzrle()) {
1882         XBZRLE_cache_lock();
1883         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1884         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1885                                   TARGET_PAGE_SIZE,
1886                                   TARGET_PAGE_SIZE);
1887         if (!XBZRLE.cache) {
1888             XBZRLE_cache_unlock();
1889             error_report("Error creating cache");
1890             return -1;
1891         }
1892         XBZRLE_cache_unlock();
1893
1894         /* We prefer not to abort if there is no memory */
1895         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1896         if (!XBZRLE.encoded_buf) {
1897             error_report("Error allocating encoded_buf");
1898             return -1;
1899         }
1900
1901         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1902         if (!XBZRLE.current_buf) {
1903             error_report("Error allocating current_buf");
1904             g_free(XBZRLE.encoded_buf);
1905             XBZRLE.encoded_buf = NULL;
1906             return -1;
1907         }
1908     }
1909
1910     /* For memory_global_dirty_log_start below.  */
1911     qemu_mutex_lock_iothread();
1912
1913     qemu_mutex_lock_ramlist();
1914     rcu_read_lock();
1915     ram_state_reset(rs);
1916
1917     /* Skip setting bitmap if there is no RAM */
1918     if (ram_bytes_total()) {
1919         RAMBlock *block;
1920
1921         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1922             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1923
1924             block->bmap = bitmap_new(pages);
1925             bitmap_set(block->bmap, 0, pages);
1926             if (migrate_postcopy_ram()) {
1927                 block->unsentmap = bitmap_new(pages);
1928                 bitmap_set(block->unsentmap, 0, pages);
1929             }
1930         }
1931     }
1932
1933     /*
1934      * Count the total number of pages used by ram blocks not including any
1935      * gaps due to alignment or unplugs.
1936      */
1937     rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1938
1939     memory_global_dirty_log_start();
1940     migration_bitmap_sync(rs);
1941     qemu_mutex_unlock_ramlist();
1942     qemu_mutex_unlock_iothread();
1943     rcu_read_unlock();
1944
1945     return 0;
1946 }
1947
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
1954
1955 /**
1956  * ram_save_setup: Setup RAM for migration
1957  *
1958  * Returns zero to indicate success and negative for error
1959  *
1960  * @f: QEMUFile where to send the data
1961  * @opaque: RAMState pointer
1962  */
1963 static int ram_save_setup(QEMUFile *f, void *opaque)
1964 {
1965     RAMState *rs = opaque;
1966     RAMBlock *block;
1967
1968     /* migration has already setup the bitmap, reuse it. */
1969     if (!migration_in_colo_state()) {
1970         if (ram_state_init(rs) < 0) {
1971             return -1;
1972          }
1973     }
1974     rs->f = f;
1975
1976     rcu_read_lock();
1977
1978     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1979
1980     RAMBLOCK_FOREACH(block) {
1981         qemu_put_byte(f, strlen(block->idstr));
1982         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1983         qemu_put_be64(f, block->used_length);
1984         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1985             qemu_put_be64(f, block->page_size);
1986         }
1987     }
1988
1989     rcu_read_unlock();
1990
1991     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1992     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1993
1994     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1995
1996     return 0;
1997 }
1998
1999 /**
2000  * ram_save_iterate: iterative stage for migration
2001  *
2002  * Returns zero to indicate success and negative for error
2003  *
2004  * @f: QEMUFile where to send the data
2005  * @opaque: RAMState pointer
2006  */
2007 static int ram_save_iterate(QEMUFile *f, void *opaque)
2008 {
2009     RAMState *rs = opaque;
2010     int ret;
2011     int i;
2012     int64_t t0;
2013     int done = 0;
2014
2015     rcu_read_lock();
2016     if (ram_list.version != rs->last_version) {
2017         ram_state_reset(rs);
2018     }
2019
2020     /* Read version before ram_list.blocks */
2021     smp_rmb();
2022
2023     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2024
2025     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2026     i = 0;
2027     while ((ret = qemu_file_rate_limit(f)) == 0) {
2028         int pages;
2029
2030         pages = ram_find_and_save_block(rs, false);
2031         /* no more pages to sent */
2032         if (pages == 0) {
2033             done = 1;
2034             break;
2035         }
2036         rs->iterations++;
2037
2038         /* we want to check in the 1st loop, just in case it was the 1st time
2039            and we had to sync the dirty bitmap.
2040            qemu_get_clock_ns() is a bit expensive, so we only check each some
2041            iterations
2042         */
2043         if ((i & 63) == 0) {
2044             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2045             if (t1 > MAX_WAIT) {
2046                 trace_ram_save_iterate_big_wait(t1, i);
2047                 break;
2048             }
2049         }
2050         i++;
2051     }
2052     flush_compressed_data(rs);
2053     rcu_read_unlock();
2054
2055     /*
2056      * Must occur before EOS (or any QEMUFile operation)
2057      * because of RDMA protocol.
2058      */
2059     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2060
2061     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2062     rs->bytes_transferred += 8;
2063
2064     ret = qemu_file_get_error(f);
2065     if (ret < 0) {
2066         return ret;
2067     }
2068
2069     return done;
2070 }
2071
2072 /**
2073  * ram_save_complete: function called to send the remaining amount of ram
2074  *
2075  * Returns zero to indicate success
2076  *
2077  * Called with iothread lock
2078  *
2079  * @f: QEMUFile where to send the data
2080  * @opaque: RAMState pointer
2081  */
2082 static int ram_save_complete(QEMUFile *f, void *opaque)
2083 {
2084     RAMState *rs = opaque;
2085
2086     rcu_read_lock();
2087
2088     if (!migration_in_postcopy()) {
2089         migration_bitmap_sync(rs);
2090     }
2091
2092     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2093
2094     /* try transferring iterative blocks of memory */
2095
2096     /* flush all remaining blocks regardless of rate limiting */
2097     while (true) {
2098         int pages;
2099
2100         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2101         /* no more blocks to sent */
2102         if (pages == 0) {
2103             break;
2104         }
2105     }
2106
2107     flush_compressed_data(rs);
2108     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2109
2110     rcu_read_unlock();
2111
2112     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2113
2114     return 0;
2115 }
2116
2117 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2118                              uint64_t *non_postcopiable_pending,
2119                              uint64_t *postcopiable_pending)
2120 {
2121     RAMState *rs = opaque;
2122     uint64_t remaining_size;
2123
2124     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2125
2126     if (!migration_in_postcopy() &&
2127         remaining_size < max_size) {
2128         qemu_mutex_lock_iothread();
2129         rcu_read_lock();
2130         migration_bitmap_sync(rs);
2131         rcu_read_unlock();
2132         qemu_mutex_unlock_iothread();
2133         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2134     }
2135
2136     /* We can do postcopy, and all the data is postcopiable */
2137     *postcopiable_pending += remaining_size;
2138 }
2139
2140 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2141 {
2142     unsigned int xh_len;
2143     int xh_flags;
2144     uint8_t *loaded_data;
2145
2146     if (!xbzrle_decoded_buf) {
2147         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2148     }
2149     loaded_data = xbzrle_decoded_buf;
2150
2151     /* extract RLE header */
2152     xh_flags = qemu_get_byte(f);
2153     xh_len = qemu_get_be16(f);
2154
2155     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2156         error_report("Failed to load XBZRLE page - wrong compression!");
2157         return -1;
2158     }
2159
2160     if (xh_len > TARGET_PAGE_SIZE) {
2161         error_report("Failed to load XBZRLE page - len overflow!");
2162         return -1;
2163     }
2164     /* load data and decode */
2165     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2166
2167     /* decode RLE */
2168     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2169                              TARGET_PAGE_SIZE) == -1) {
2170         error_report("Failed to load XBZRLE page - decode error!");
2171         return -1;
2172     }
2173
2174     return 0;
2175 }
2176
2177 /**
2178  * ram_block_from_stream: read a RAMBlock id from the migration stream
2179  *
2180  * Must be called from within a rcu critical section.
2181  *
2182  * Returns a pointer from within the RCU-protected ram_list.
2183  *
2184  * @f: QEMUFile where to read the data from
2185  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2186  */
2187 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2188 {
2189     static RAMBlock *block = NULL;
2190     char id[256];
2191     uint8_t len;
2192
2193     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2194         if (!block) {
2195             error_report("Ack, bad migration stream!");
2196             return NULL;
2197         }
2198         return block;
2199     }
2200
2201     len = qemu_get_byte(f);
2202     qemu_get_buffer(f, (uint8_t *)id, len);
2203     id[len] = 0;
2204
2205     block = qemu_ram_block_by_name(id);
2206     if (!block) {
2207         error_report("Can't find block %s", id);
2208         return NULL;
2209     }
2210
2211     return block;
2212 }
2213
2214 static inline void *host_from_ram_block_offset(RAMBlock *block,
2215                                                ram_addr_t offset)
2216 {
2217     if (!offset_in_ramblock(block, offset)) {
2218         return NULL;
2219     }
2220
2221     return block->host + offset;
2222 }
2223
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fills @size bytes at @host with @ch.  When @ch is zero and the range
 * already reads as all zeroes the write is skipped.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Already zero: leave the memory untouched */
        return;
    }

    memset(host, ch, size);
}
2240
2241 static void *do_data_decompress(void *opaque)
2242 {
2243     DecompressParam *param = opaque;
2244     unsigned long pagesize;
2245     uint8_t *des;
2246     int len;
2247
2248     qemu_mutex_lock(&param->mutex);
2249     while (!param->quit) {
2250         if (param->des) {
2251             des = param->des;
2252             len = param->len;
2253             param->des = 0;
2254             qemu_mutex_unlock(&param->mutex);
2255
2256             pagesize = TARGET_PAGE_SIZE;
2257             /* uncompress() will return failed in some case, especially
2258              * when the page is dirted when doing the compression, it's
2259              * not a problem because the dirty page will be retransferred
2260              * and uncompress() won't break the data in other pages.
2261              */
2262             uncompress((Bytef *)des, &pagesize,
2263                        (const Bytef *)param->compbuf, len);
2264
2265             qemu_mutex_lock(&decomp_done_lock);
2266             param->done = true;
2267             qemu_cond_signal(&decomp_done_cond);
2268             qemu_mutex_unlock(&decomp_done_lock);
2269
2270             qemu_mutex_lock(&param->mutex);
2271         } else {
2272             qemu_cond_wait(&param->cond, &param->mutex);
2273         }
2274     }
2275     qemu_mutex_unlock(&param->mutex);
2276
2277     return NULL;
2278 }
2279
2280 static void wait_for_decompress_done(void)
2281 {
2282     int idx, thread_count;
2283
2284     if (!migrate_use_compression()) {
2285         return;
2286     }
2287
2288     thread_count = migrate_decompress_threads();
2289     qemu_mutex_lock(&decomp_done_lock);
2290     for (idx = 0; idx < thread_count; idx++) {
2291         while (!decomp_param[idx].done) {
2292             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2293         }
2294     }
2295     qemu_mutex_unlock(&decomp_done_lock);
2296 }
2297
2298 void migrate_decompress_threads_create(void)
2299 {
2300     int i, thread_count;
2301
2302     thread_count = migrate_decompress_threads();
2303     decompress_threads = g_new0(QemuThread, thread_count);
2304     decomp_param = g_new0(DecompressParam, thread_count);
2305     qemu_mutex_init(&decomp_done_lock);
2306     qemu_cond_init(&decomp_done_cond);
2307     for (i = 0; i < thread_count; i++) {
2308         qemu_mutex_init(&decomp_param[i].mutex);
2309         qemu_cond_init(&decomp_param[i].cond);
2310         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2311         decomp_param[i].done = true;
2312         decomp_param[i].quit = false;
2313         qemu_thread_create(decompress_threads + i, "decompress",
2314                            do_data_decompress, decomp_param + i,
2315                            QEMU_THREAD_JOINABLE);
2316     }
2317 }
2318
2319 void migrate_decompress_threads_join(void)
2320 {
2321     int i, thread_count;
2322
2323     thread_count = migrate_decompress_threads();
2324     for (i = 0; i < thread_count; i++) {
2325         qemu_mutex_lock(&decomp_param[i].mutex);
2326         decomp_param[i].quit = true;
2327         qemu_cond_signal(&decomp_param[i].cond);
2328         qemu_mutex_unlock(&decomp_param[i].mutex);
2329     }
2330     for (i = 0; i < thread_count; i++) {
2331         qemu_thread_join(decompress_threads + i);
2332         qemu_mutex_destroy(&decomp_param[i].mutex);
2333         qemu_cond_destroy(&decomp_param[i].cond);
2334         g_free(decomp_param[i].compbuf);
2335     }
2336     g_free(decompress_threads);
2337     g_free(decomp_param);
2338     decompress_threads = NULL;
2339     decomp_param = NULL;
2340 }
2341
2342 static void decompress_data_with_multi_threads(QEMUFile *f,
2343                                                void *host, int len)
2344 {
2345     int idx, thread_count;
2346
2347     thread_count = migrate_decompress_threads();
2348     qemu_mutex_lock(&decomp_done_lock);
2349     while (true) {
2350         for (idx = 0; idx < thread_count; idx++) {
2351             if (decomp_param[idx].done) {
2352                 decomp_param[idx].done = false;
2353                 qemu_mutex_lock(&decomp_param[idx].mutex);
2354                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2355                 decomp_param[idx].des = host;
2356                 decomp_param[idx].len = len;
2357                 qemu_cond_signal(&decomp_param[idx].cond);
2358                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2359                 break;
2360             }
2361         }
2362         if (idx < thread_count) {
2363             break;
2364         } else {
2365             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2366         }
2367     }
2368     qemu_mutex_unlock(&decomp_done_lock);
2369 }
2370
2371 /**
2372  * ram_postcopy_incoming_init: allocate postcopy data structures
2373  *
2374  * Returns 0 for success and negative if there was one error
2375  *
2376  * @mis: current migration incoming state
2377  *
2378  * Allocate data structures etc needed by incoming migration with
2379  * postcopy-ram. postcopy-ram's similarly names
2380  * postcopy_ram_incoming_init does the work.
2381  */
2382 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2383 {
2384     unsigned long ram_pages = last_ram_page();
2385
2386     return postcopy_ram_incoming_init(mis, ram_pages);
2387 }
2388
2389 /**
2390  * ram_load_postcopy: load a page in postcopy case
2391  *
2392  * Returns 0 for success or -errno in case of error
2393  *
2394  * Called in postcopy mode by ram_load().
2395  * rcu_read_lock is taken prior to this being called.
2396  *
2397  * @f: QEMUFile where to send the data
2398  */
2399 static int ram_load_postcopy(QEMUFile *f)
2400 {
2401     int flags = 0, ret = 0;
2402     bool place_needed = false;
2403     bool matching_page_sizes = false;
2404     MigrationIncomingState *mis = migration_incoming_get_current();
2405     /* Temporary page that is later 'placed' */
2406     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2407     void *last_host = NULL;
2408     bool all_zero = false;
2409
2410     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2411         ram_addr_t addr;
2412         void *host = NULL;
2413         void *page_buffer = NULL;
2414         void *place_source = NULL;
2415         RAMBlock *block = NULL;
2416         uint8_t ch;
2417
2418         addr = qemu_get_be64(f);
2419         flags = addr & ~TARGET_PAGE_MASK;
2420         addr &= TARGET_PAGE_MASK;
2421
2422         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2423         place_needed = false;
2424         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2425             block = ram_block_from_stream(f, flags);
2426
2427             host = host_from_ram_block_offset(block, addr);
2428             if (!host) {
2429                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2430                 ret = -EINVAL;
2431                 break;
2432             }
2433             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2434             /*
2435              * Postcopy requires that we place whole host pages atomically;
2436              * these may be huge pages for RAMBlocks that are backed by
2437              * hugetlbfs.
2438              * To make it atomic, the data is read into a temporary page
2439              * that's moved into place later.
2440              * The migration protocol uses,  possibly smaller, target-pages
2441              * however the source ensures it always sends all the components
2442              * of a host page in order.
2443              */
2444             page_buffer = postcopy_host_page +
2445                           ((uintptr_t)host & (block->page_size - 1));
2446             /* If all TP are zero then we can optimise the place */
2447             if (!((uintptr_t)host & (block->page_size - 1))) {
2448                 all_zero = true;
2449             } else {
2450                 /* not the 1st TP within the HP */
2451                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2452                     error_report("Non-sequential target page %p/%p",
2453                                   host, last_host);
2454                     ret = -EINVAL;
2455                     break;
2456                 }
2457             }
2458
2459
2460             /*
2461              * If it's the last part of a host page then we place the host
2462              * page
2463              */
2464             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2465                                      (block->page_size - 1)) == 0;
2466             place_source = postcopy_host_page;
2467         }
2468         last_host = host;
2469
2470         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2471         case RAM_SAVE_FLAG_ZERO:
2472             ch = qemu_get_byte(f);
2473             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2474             if (ch) {
2475                 all_zero = false;
2476             }
2477             break;
2478
2479         case RAM_SAVE_FLAG_PAGE:
2480             all_zero = false;
2481             if (!place_needed || !matching_page_sizes) {
2482                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2483             } else {
2484                 /* Avoids the qemu_file copy during postcopy, which is
2485                  * going to do a copy later; can only do it when we
2486                  * do this read in one go (matching page sizes)
2487                  */
2488                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2489                                          TARGET_PAGE_SIZE);
2490             }
2491             break;
2492         case RAM_SAVE_FLAG_EOS:
2493             /* normal exit */
2494             break;
2495         default:
2496             error_report("Unknown combination of migration flags: %#x"
2497                          " (postcopy mode)", flags);
2498             ret = -EINVAL;
2499         }
2500
2501         if (place_needed) {
2502             /* This gets called at the last target page in the host page */
2503             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2504
2505             if (all_zero) {
2506                 ret = postcopy_place_page_zero(mis, place_dest,
2507                                                block->page_size);
2508             } else {
2509                 ret = postcopy_place_page(mis, place_dest,
2510                                           place_source, block->page_size);
2511             }
2512         }
2513         if (!ret) {
2514             ret = qemu_file_get_error(f);
2515         }
2516     }
2517
2518     return ret;
2519 }
2520
2521 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2522 {
2523     int flags = 0, ret = 0;
2524     static uint64_t seq_iter;
2525     int len = 0;
2526     /*
2527      * If system is running in postcopy mode, page inserts to host memory must
2528      * be atomic
2529      */
2530     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2531     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2532     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2533
2534     seq_iter++;
2535
2536     if (version_id != 4) {
2537         ret = -EINVAL;
2538     }
2539
2540     /* This RCU critical section can be very long running.
2541      * When RCU reclaims in the code start to become numerous,
2542      * it will be necessary to reduce the granularity of this
2543      * critical section.
2544      */
2545     rcu_read_lock();
2546
2547     if (postcopy_running) {
2548         ret = ram_load_postcopy(f);
2549     }
2550
2551     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2552         ram_addr_t addr, total_ram_bytes;
2553         void *host = NULL;
2554         uint8_t ch;
2555
2556         addr = qemu_get_be64(f);
2557         flags = addr & ~TARGET_PAGE_MASK;
2558         addr &= TARGET_PAGE_MASK;
2559
2560         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2561                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2562             RAMBlock *block = ram_block_from_stream(f, flags);
2563
2564             host = host_from_ram_block_offset(block, addr);
2565             if (!host) {
2566                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2567                 ret = -EINVAL;
2568                 break;
2569             }
2570             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2571         }
2572
2573         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2574         case RAM_SAVE_FLAG_MEM_SIZE:
2575             /* Synchronize RAM block list */
2576             total_ram_bytes = addr;
2577             while (!ret && total_ram_bytes) {
2578                 RAMBlock *block;
2579                 char id[256];
2580                 ram_addr_t length;
2581
2582                 len = qemu_get_byte(f);
2583                 qemu_get_buffer(f, (uint8_t *)id, len);
2584                 id[len] = 0;
2585                 length = qemu_get_be64(f);
2586
2587                 block = qemu_ram_block_by_name(id);
2588                 if (block) {
2589                     if (length != block->used_length) {
2590                         Error *local_err = NULL;
2591
2592                         ret = qemu_ram_resize(block, length,
2593                                               &local_err);
2594                         if (local_err) {
2595                             error_report_err(local_err);
2596                         }
2597                     }
2598                     /* For postcopy we need to check hugepage sizes match */
2599                     if (postcopy_advised &&
2600                         block->page_size != qemu_host_page_size) {
2601                         uint64_t remote_page_size = qemu_get_be64(f);
2602                         if (remote_page_size != block->page_size) {
2603                             error_report("Mismatched RAM page size %s "
2604                                          "(local) %zd != %" PRId64,
2605                                          id, block->page_size,
2606                                          remote_page_size);
2607                             ret = -EINVAL;
2608                         }
2609                     }
2610                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2611                                           block->idstr);
2612                 } else {
2613                     error_report("Unknown ramblock \"%s\", cannot "
2614                                  "accept migration", id);
2615                     ret = -EINVAL;
2616                 }
2617
2618                 total_ram_bytes -= length;
2619             }
2620             break;
2621
2622         case RAM_SAVE_FLAG_ZERO:
2623             ch = qemu_get_byte(f);
2624             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2625             break;
2626
2627         case RAM_SAVE_FLAG_PAGE:
2628             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2629             break;
2630
2631         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2632             len = qemu_get_be32(f);
2633             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2634                 error_report("Invalid compressed data length: %d", len);
2635                 ret = -EINVAL;
2636                 break;
2637             }
2638             decompress_data_with_multi_threads(f, host, len);
2639             break;
2640
2641         case RAM_SAVE_FLAG_XBZRLE:
2642             if (load_xbzrle(f, addr, host) < 0) {
2643                 error_report("Failed to decompress XBZRLE page at "
2644                              RAM_ADDR_FMT, addr);
2645                 ret = -EINVAL;
2646                 break;
2647             }
2648             break;
2649         case RAM_SAVE_FLAG_EOS:
2650             /* normal exit */
2651             break;
2652         default:
2653             if (flags & RAM_SAVE_FLAG_HOOK) {
2654                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2655             } else {
2656                 error_report("Unknown combination of migration flags: %#x",
2657                              flags);
2658                 ret = -EINVAL;
2659             }
2660         }
2661         if (!ret) {
2662             ret = qemu_file_get_error(f);
2663         }
2664     }
2665
2666     wait_for_decompress_done();
2667     rcu_read_unlock();
2668     trace_ram_load_complete(ret, seq_iter);
2669     return ret;
2670 }
2671
2672 static SaveVMHandlers savevm_ram_handlers = {
2673     .save_live_setup = ram_save_setup,
2674     .save_live_iterate = ram_save_iterate,
2675     .save_live_complete_postcopy = ram_save_complete,
2676     .save_live_complete_precopy = ram_save_complete,
2677     .save_live_pending = ram_save_pending,
2678     .load_state = ram_load,
2679     .cleanup = ram_migration_cleanup,
2680 };
2681
2682 void ram_mig_init(void)
2683 {
2684     qemu_mutex_init(&XBZRLE.lock);
2685     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2686 }