migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <[email protected]>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "xbzrle.h"
  39 #include "migration/migration.h"
  40 #include "migration/qemu-file.h"
  41 #include "migration/vmstate.h"
  42 #include "postcopy-ram.h"
  43 #include "exec/address-spaces.h"
  44 #include "migration/page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "trace.h"
  47 #include "exec/ram_addr.h"
  48 #include "qemu/rcu_queue.h"
  49 #include "migration/colo.h"
  50
  51 /***********************************************************/
  52 /* ram save/restore */
  53
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  59
/*
 * Flags OR-ed into the page offset that save_page_header() writes as the
 * first 64-bit word of every page record.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02 /* all-zero page, see save_zero_page() */
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08 /* raw page contents follow the header */
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20 /* same block as last header, no idstr */
#define RAM_SAVE_FLAG_XBZRLE   0x40 /* XBZRLE-encoded page follows */
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100 /* see do_compress_ram_page() */
  69
/*
 * Page handed to cache_insert() by xbzrle_cache_zero_page() to stand for
 * an all-zero page.
 * NOTE(review): allocated outside this part of the file; presumably
 * TARGET_PAGE_SIZE zero bytes - confirm.
 */
static uint8_t *ZERO_TARGET_PAGE;
  71
  72 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  73 {
  74     return buffer_is_zero(p, size);
  75 }
  76
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding; save_xbzrle_page() has
     * xbzrle_encode_buffer() write its output here */
    uint8_t *encoded_buf;
    /* buffer for storing page content; save_xbzrle_page() copies the
     * page here before encoding it */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken and released through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
  91
  92 static void XBZRLE_cache_lock(void)
  93 {
  94     if (migrate_use_xbzrle())
  95         qemu_mutex_lock(&XBZRLE.lock);
  96 }
  97
  98 static void XBZRLE_cache_unlock(void)
  99 {
 100     if (migrate_use_xbzrle())
 101         qemu_mutex_unlock(&XBZRLE.lock);
 102 }
 103
 104 /**
 105  * xbzrle_cache_resize: resize the xbzrle cache
 106  *
 107  * This function is called from qmp_migrate_set_cache_size in main
 108  * thread, possibly while a migration is in progress.  A running
 109  * migration may be using the cache and might finish during this call,
 110  * hence changes to the cache are protected by XBZRLE.lock().
 111  *
 112  * Returns the new_size or negative in case of error.
 113  *
 114  * @new_size: new cache size
 115  */
 116 int64_t xbzrle_cache_resize(int64_t new_size)
 117 {
 118     PageCache *new_cache;
 119     int64_t ret;
 120
 121     if (new_size < TARGET_PAGE_SIZE) {
 122         return -1;
 123     }
 124
 125     XBZRLE_cache_lock();
 126
 127     if (XBZRLE.cache != NULL) {
 128         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 129             goto out_new_size;
 130         }
 131         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 132                                         TARGET_PAGE_SIZE);
 133         if (!new_cache) {
 134             error_report("Error creating cache");
 135             ret = -1;
 136             goto out;
 137         }
 138
 139         cache_fini(XBZRLE.cache);
 140         XBZRLE.cache = new_cache;
 141     }
 142
 143 out_new_size:
 144     ret = pow2floor(new_size);
 145 out:
 146     XBZRLE_cache_unlock();
 147     return ret;
 148 }
 149
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* NOTE(review): presumably the request covers @len bytes starting at
     * @offset within @rb - confirm against the code that queues requests */
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    /* linkage for the RAMState.src_page_requests queue */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 161
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data; save_page_header() uses it
     * to decide whether the block name has to be written again */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times in a row the dirty rate was found too high;
     * see migration_bitmap_sync() */
    int dirty_rate_high_cnt;
    /* How many times we have synchronized the bitmap */
    uint64_t bitmap_sync_count;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Accounting fields */
    /* number of zero pages.  It used to be pages filled by the same char. */
    uint64_t zero_pages;
    /* number of normal transferred pages */
    uint64_t norm_pages;
    /* Iterations since start */
    uint64_t iterations;
    /* xbzrle transmitted bytes.  Notice that this is with
     * compression, they can't be calculated from the pages */
    uint64_t xbzrle_bytes;
    /* xbzrle transmitted pages */
    uint64_t xbzrle_pages;
    /* xbzrle number of cache misses */
    uint64_t xbzrle_cache_miss;
    /* xbzrle miss rate */
    double xbzrle_cache_miss_rate;
    /* xbzrle number of overflows */
    uint64_t xbzrle_overflows;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* total number of bytes transferred */
    uint64_t bytes_transferred;
    /* number of dirtied pages in the last second */
    uint64_t dirty_pages_rate;
    /* Count of requests incoming from destination */
    uint64_t postcopy_requests;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The single RAM migration state read by the accessor functions below */
static RAMState ram_state;
 228
 229 uint64_t dup_mig_pages_transferred(void)
 230 {
 231     return ram_state.zero_pages;
 232 }
 233
 234 uint64_t norm_mig_pages_transferred(void)
 235 {
 236     return ram_state.norm_pages;
 237 }
 238
 239 uint64_t xbzrle_mig_bytes_transferred(void)
 240 {
 241     return ram_state.xbzrle_bytes;
 242 }
 243
 244 uint64_t xbzrle_mig_pages_transferred(void)
 245 {
 246     return ram_state.xbzrle_pages;
 247 }
 248
 249 uint64_t xbzrle_mig_pages_cache_miss(void)
 250 {
 251     return ram_state.xbzrle_cache_miss;
 252 }
 253
 254 double xbzrle_mig_cache_miss_rate(void)
 255 {
 256     return ram_state.xbzrle_cache_miss_rate;
 257 }
 258
 259 uint64_t xbzrle_mig_pages_overflow(void)
 260 {
 261     return ram_state.xbzrle_overflows;
 262 }
 263
 264 uint64_t ram_bytes_transferred(void)
 265 {
 266     return ram_state.bytes_transferred;
 267 }
 268
 269 uint64_t ram_bytes_remaining(void)
 270 {
 271     return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
 272 }
 273
 274 uint64_t ram_dirty_sync_count(void)
 275 {
 276     return ram_state.bitmap_sync_count;
 277 }
 278
 279 uint64_t ram_dirty_pages_rate(void)
 280 {
 281     return ram_state.dirty_pages_rate;
 282 }
 283
 284 uint64_t ram_postcopy_requests(void)
 285 {
 286     return ram_state.postcopy_requests;
 287 }
 288
/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from; a target-page index within @block
     * (ram_save_page() shifts it by TARGET_PAGE_BITS to get the offset) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 299
/* Per-thread state shared between the migration thread and one
 * compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, once a page has been
     * compressed; cleared by the migration thread when it hands out
     * new work */
    bool done;
    /* asks the worker to exit; set by terminate_compression_threads() */
    bool quit;
    /* dummy QEMUFile used as a buffer for the compressed output */
    QEMUFile *file;
    /* mutex/cond: the worker sleeps on @cond, with @mutex held, while
     * @block is NULL and @quit is false */
    QemuMutex mutex;
    QemuCond cond;
    /* page to compress; @block == NULL means no work is pending */
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;
 310
/* Per-thread state for a decompression worker.
 * NOTE(review): the code using this struct is outside this part of the
 * file; field meanings below are presumed from their names - confirm. */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination for the decompressed page */
    void *des;
    /* presumably the compressed input and its length */
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;
 321
/* Arrays with one entry per compression thread; allocated in
 * migrate_compress_threads_create() and freed in
 * migrate_compress_threads_join() */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the variables above */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined further down; declared here because do_data_compress() calls it */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 340
 341 static void *do_data_compress(void *opaque)
 342 {
 343     CompressParam *param = opaque;
 344     RAMBlock *block;
 345     ram_addr_t offset;
 346
 347     qemu_mutex_lock(&param->mutex);
 348     while (!param->quit) {
 349         if (param->block) {
 350             block = param->block;
 351             offset = param->offset;
 352             param->block = NULL;
 353             qemu_mutex_unlock(&param->mutex);
 354
 355             do_compress_ram_page(param->file, block, offset);
 356
 357             qemu_mutex_lock(&comp_done_lock);
 358             param->done = true;
 359             qemu_cond_signal(&comp_done_cond);
 360             qemu_mutex_unlock(&comp_done_lock);
 361
 362             qemu_mutex_lock(&param->mutex);
 363         } else {
 364             qemu_cond_wait(&param->cond, &param->mutex);
 365         }
 366     }
 367     qemu_mutex_unlock(&param->mutex);
 368
 369     return NULL;
 370 }
 371
 372 static inline void terminate_compression_threads(void)
 373 {
 374     int idx, thread_count;
 375
 376     thread_count = migrate_compress_threads();
 377
 378     for (idx = 0; idx < thread_count; idx++) {
 379         qemu_mutex_lock(&comp_param[idx].mutex);
 380         comp_param[idx].quit = true;
 381         qemu_cond_signal(&comp_param[idx].cond);
 382         qemu_mutex_unlock(&comp_param[idx].mutex);
 383     }
 384 }
 385
 386 void migrate_compress_threads_join(void)
 387 {
 388     int i, thread_count;
 389
 390     if (!migrate_use_compression()) {
 391         return;
 392     }
 393     terminate_compression_threads();
 394     thread_count = migrate_compress_threads();
 395     for (i = 0; i < thread_count; i++) {
 396         qemu_thread_join(compress_threads + i);
 397         qemu_fclose(comp_param[i].file);
 398         qemu_mutex_destroy(&comp_param[i].mutex);
 399         qemu_cond_destroy(&comp_param[i].cond);
 400     }
 401     qemu_mutex_destroy(&comp_done_lock);
 402     qemu_cond_destroy(&comp_done_cond);
 403     g_free(compress_threads);
 404     g_free(comp_param);
 405     compress_threads = NULL;
 406     comp_param = NULL;
 407 }
 408
 409 void migrate_compress_threads_create(void)
 410 {
 411     int i, thread_count;
 412
 413     if (!migrate_use_compression()) {
 414         return;
 415     }
 416     thread_count = migrate_compress_threads();
 417     compress_threads = g_new0(QemuThread, thread_count);
 418     comp_param = g_new0(CompressParam, thread_count);
 419     qemu_cond_init(&comp_done_cond);
 420     qemu_mutex_init(&comp_done_lock);
 421     for (i = 0; i < thread_count; i++) {
 422         /* comp_param[i].file is just used as a dummy buffer to save data,
 423          * set its ops to empty.
 424          */
 425         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 426         comp_param[i].done = true;
 427         comp_param[i].quit = false;
 428         qemu_mutex_init(&comp_param[i].mutex);
 429         qemu_cond_init(&comp_param[i].cond);
 430         qemu_thread_create(compress_threads + i, "compress",
 431                            do_data_compress, comp_param + i,
 432                            QEMU_THREAD_JOINABLE);
 433     }
 434 }
 435
 436 /**
 437  * save_page_header: write page header to wire
 438  *
 439  * If this is the 1st block, it also writes the block identification
 440  *
 441  * Returns the number of bytes written
 442  *
 443  * @f: QEMUFile where to send the data
 444  * @block: block that contains the page we want to send
 445  * @offset: offset inside the block for the page
 446  *          in the lower bits, it contains flags
 447  */
 448 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 449                                ram_addr_t offset)
 450 {
 451     size_t size, len;
 452
 453     if (block == rs->last_sent_block) {
 454         offset |= RAM_SAVE_FLAG_CONTINUE;
 455     }
 456     qemu_put_be64(f, offset);
 457     size = 8;
 458
 459     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 460         len = strlen(block->idstr);
 461         qemu_put_byte(f, len);
 462         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 463         size += 1 + len;
 464         rs->last_sent_block = block;
 465     }
 466     return size;
 467 }
 468
 469 /**
 470  * mig_throttle_guest_down: throotle down the guest
 471  *
 472  * Reduce amount of guest cpu execution to hopefully slow down memory
 473  * writes. If guest dirty memory rate is reduced below the rate at
 474  * which we can transfer pages to the destination then we should be
 475  * able to complete migration. Some workloads dirty memory way too
 476  * fast and will not effectively converge, even with auto-converge.
 477  */
 478 static void mig_throttle_guest_down(void)
 479 {
 480     MigrationState *s = migrate_get_current();
 481     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 482     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 483
 484     /* We have not started throttling yet. Let's start it. */
 485     if (!cpu_throttle_active()) {
 486         cpu_throttle_set(pct_initial);
 487     } else {
 488         /* Throttling already on, just increase the rate */
 489         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 490     }
 491 }
 492
 493 /**
 494  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 495  *
 496  * @rs: current RAM state
 497  * @current_addr: address for the zero page
 498  *
 499  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 500  * The important thing is that a stale (not-yet-0'd) page be replaced
 501  * by the new data.
 502  * As a bonus, if the page wasn't in the cache it gets added so that
 503  * when a small write is made into the 0'd page it gets XBZRLE sent.
 504  */
 505 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 506 {
 507     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 508         return;
 509     }
 510
 511     /* We don't care if this fails to allocate a new cache page
 512      * as long as it updated an old one */
 513     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 514                  rs->bitmap_sync_count);
 515 }
 516
 517 #define ENCODING_FLAG_XBZRLE 0x1
 518
 519 /**
 520  * save_xbzrle_page: compress and send current page
 521  *
 522  * Returns: 1 means that we wrote the page
 523  *          0 means that page is identical to the one already sent
 524  *          -1 means that xbzrle would be longer than normal
 525  *
 526  * @rs: current RAM state
 527  * @current_data: pointer to the address of the page contents
 528  * @current_addr: addr of the page
 529  * @block: block that contains the page we want to send
 530  * @offset: offset inside the block for the page
 531  * @last_stage: if we are at the completion stage
 532  */
 533 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 534                             ram_addr_t current_addr, RAMBlock *block,
 535                             ram_addr_t offset, bool last_stage)
 536 {
 537     int encoded_len = 0, bytes_xbzrle;
 538     uint8_t *prev_cached_page;
 539
 540     if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
 541         rs->xbzrle_cache_miss++;
 542         if (!last_stage) {
 543             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 544                              rs->bitmap_sync_count) == -1) {
 545                 return -1;
 546             } else {
 547                 /* update *current_data when the page has been
 548                    inserted into cache */
 549                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 550             }
 551         }
 552         return -1;
 553     }
 554
 555     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 556
 557     /* save current buffer into memory */
 558     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 559
 560     /* XBZRLE encoding (if there is no overflow) */
 561     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 562                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 563                                        TARGET_PAGE_SIZE);
 564     if (encoded_len == 0) {
 565         trace_save_xbzrle_page_skipping();
 566         return 0;
 567     } else if (encoded_len == -1) {
 568         trace_save_xbzrle_page_overflow();
 569         rs->xbzrle_overflows++;
 570         /* update data in the cache */
 571         if (!last_stage) {
 572             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 573             *current_data = prev_cached_page;
 574         }
 575         return -1;
 576     }
 577
 578     /* we need to update the data in the cache, in order to get the same data */
 579     if (!last_stage) {
 580         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 581     }
 582
 583     /* Send XBZRLE based compressed page */
 584     bytes_xbzrle = save_page_header(rs, rs->f, block,
 585                                     offset | RAM_SAVE_FLAG_XBZRLE);
 586     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 587     qemu_put_be16(rs->f, encoded_len);
 588     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 589     bytes_xbzrle += encoded_len + 1 + 2;
 590     rs->xbzrle_pages++;
 591     rs->xbzrle_bytes += bytes_xbzrle;
 592     rs->bytes_transferred += bytes_xbzrle;
 593
 594     return 1;
 595 }
 596
 597 /**
 598  * migration_bitmap_find_dirty: find the next dirty page from start
 599  *
 600  * Called with rcu_read_lock() to protect migration_bitmap
 601  *
 602  * Returns the byte offset within memory region of the start of a dirty page
 603  *
 604  * @rs: current RAM state
 605  * @rb: RAMBlock where to search for dirty pages
 606  * @start: page where we start the search
 607  */
 608 static inline
 609 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 610                                           unsigned long start)
 611 {
 612     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 613     unsigned long *bitmap = rb->bmap;
 614     unsigned long next;
 615
 616     if (rs->ram_bulk_stage && start > 0) {
 617         next = start + 1;
 618     } else {
 619         next = find_next_bit(bitmap, size, start);
 620     }
 621
 622     return next;
 623 }
 624
 625 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 626                                                 RAMBlock *rb,
 627                                                 unsigned long page)
 628 {
 629     bool ret;
 630
 631     ret = test_and_clear_bit(page, rb->bmap);
 632
 633     if (ret) {
 634         rs->migration_dirty_pages--;
 635     }
 636     return ret;
 637 }
 638
 639 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 640                                         ram_addr_t start, ram_addr_t length)
 641 {
 642     rs->migration_dirty_pages +=
 643         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 644                                               &rs->num_dirty_pages_period);
 645 }
 646
 647 /**
 648  * ram_pagesize_summary: calculate all the pagesizes of a VM
 649  *
 650  * Returns a summary bitmap of the page sizes of all RAMBlocks
 651  *
 652  * For VMs with just normal pages this is equivalent to the host page
 653  * size. If it's got some huge pages then it's the OR of all the
 654  * different page sizes.
 655  */
 656 uint64_t ram_pagesize_summary(void)
 657 {
 658     RAMBlock *block;
 659     uint64_t summary = 0;
 660
 661     RAMBLOCK_FOREACH(block) {
 662         summary |= block->page_size;
 663     }
 664
 665     return summary;
 666 }
 667
 668 static void migration_bitmap_sync(RAMState *rs)
 669 {
 670     RAMBlock *block;
 671     int64_t end_time;
 672     uint64_t bytes_xfer_now;
 673
 674     rs->bitmap_sync_count++;
 675
 676     if (!rs->time_last_bitmap_sync) {
 677         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 678     }
 679
 680     trace_migration_bitmap_sync_start();
 681     memory_global_dirty_log_sync();
 682
 683     qemu_mutex_lock(&rs->bitmap_mutex);
 684     rcu_read_lock();
 685     RAMBLOCK_FOREACH(block) {
 686         migration_bitmap_sync_range(rs, block, 0, block->used_length);
 687     }
 688     rcu_read_unlock();
 689     qemu_mutex_unlock(&rs->bitmap_mutex);
 690
 691     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 692
 693     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 694
 695     /* more than 1 second = 1000 millisecons */
 696     if (end_time > rs->time_last_bitmap_sync + 1000) {
 697         /* calculate period counters */
 698         rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
 699             / (end_time - rs->time_last_bitmap_sync);
 700         bytes_xfer_now = ram_bytes_transferred();
 701
 702         if (migrate_auto_converge()) {
 703             /* The following detection logic can be refined later. For now:
 704                Check to see if the dirtied bytes is 50% more than the approx.
 705                amount of bytes that just got transferred since the last time we
 706                were in this routine. If that happens twice, start or increase
 707                throttling */
 708
 709             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 710                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 711                 (++rs->dirty_rate_high_cnt >= 2)) {
 712                     trace_migration_throttle();
 713                     rs->dirty_rate_high_cnt = 0;
 714                     mig_throttle_guest_down();
 715             }
 716         }
 717
 718         if (migrate_use_xbzrle()) {
 719             if (rs->iterations_prev != rs->iterations) {
 720                 rs->xbzrle_cache_miss_rate =
 721                    (double)(rs->xbzrle_cache_miss -
 722                             rs->xbzrle_cache_miss_prev) /
 723                    (rs->iterations - rs->iterations_prev);
 724             }
 725             rs->iterations_prev = rs->iterations;
 726             rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
 727         }
 728
 729         /* reset period counters */
 730         rs->time_last_bitmap_sync = end_time;
 731         rs->num_dirty_pages_period = 0;
 732         rs->bytes_xfer_prev = bytes_xfer_now;
 733     }
 734     if (migrate_use_events()) {
 735         qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
 736     }
 737 }
 738
 739 /**
 740  * save_zero_page: send the zero page to the stream
 741  *
 742  * Returns the number of pages written.
 743  *
 744  * @rs: current RAM state
 745  * @block: block that contains the page we want to send
 746  * @offset: offset inside the block for the page
 747  * @p: pointer to the page
 748  */
 749 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 750                           uint8_t *p)
 751 {
 752     int pages = -1;
 753
 754     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 755         rs->zero_pages++;
 756         rs->bytes_transferred +=
 757             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 758         qemu_put_byte(rs->f, 0);
 759         rs->bytes_transferred += 1;
 760         pages = 1;
 761     }
 762
 763     return pages;
 764 }
 765
 766 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 767 {
 768     if (!migrate_release_ram() || !migration_in_postcopy()) {
 769         return;
 770     }
 771
 772     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 773 }
 774
 775 /**
 776  * ram_save_page: send the given page to the stream
 777  *
 778  * Returns the number of pages written.
 779  *          < 0 - error
 780  *          >=0 - Number of pages written - this might legally be 0
 781  *                if xbzrle noticed the page was the same.
 782  *
 783  * @rs: current RAM state
 784  * @block: block that contains the page we want to send
 785  * @offset: offset inside the block for the page
 786  * @last_stage: if we are at the completion stage
 787  */
 788 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 789 {
 790     int pages = -1;
 791     uint64_t bytes_xmit;
 792     ram_addr_t current_addr;
 793     uint8_t *p;
 794     int ret;
 795     bool send_async = true;
 796     RAMBlock *block = pss->block;
 797     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 798
 799     p = block->host + offset;
 800     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 801
 802     /* In doubt sent page as normal */
 803     bytes_xmit = 0;
 804     ret = ram_control_save_page(rs->f, block->offset,
 805                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 806     if (bytes_xmit) {
 807         rs->bytes_transferred += bytes_xmit;
 808         pages = 1;
 809     }
 810
 811     XBZRLE_cache_lock();
 812
 813     current_addr = block->offset + offset;
 814
 815     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 816         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 817             if (bytes_xmit > 0) {
 818                 rs->norm_pages++;
 819             } else if (bytes_xmit == 0) {
 820                 rs->zero_pages++;
 821             }
 822         }
 823     } else {
 824         pages = save_zero_page(rs, block, offset, p);
 825         if (pages > 0) {
 826             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 827              * page would be stale
 828              */
 829             xbzrle_cache_zero_page(rs, current_addr);
 830             ram_release_pages(block->idstr, offset, pages);
 831         } else if (!rs->ram_bulk_stage &&
 832                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 833             pages = save_xbzrle_page(rs, &p, current_addr, block,
 834                                      offset, last_stage);
 835             if (!last_stage) {
 836                 /* Can't send this cached data async, since the cache page
 837                  * might get updated before it gets to the wire
 838                  */
 839                 send_async = false;
 840             }
 841         }
 842     }
 843
 844     /* XBZRLE overflow or normal page */
 845     if (pages == -1) {
 846         rs->bytes_transferred += save_page_header(rs, rs->f, block,
 847                                                   offset | RAM_SAVE_FLAG_PAGE);
 848         if (send_async) {
 849             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
 850                                   migrate_release_ram() &
 851                                   migration_in_postcopy());
 852         } else {
 853             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
 854         }
 855         rs->bytes_transferred += TARGET_PAGE_SIZE;
 856         pages = 1;
 857         rs->norm_pages++;
 858     }
 859
 860     XBZRLE_cache_unlock();
 861
 862     return pages;
 863 }
 864
 865 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 866                                 ram_addr_t offset)
 867 {
 868     RAMState *rs = &ram_state;
 869     int bytes_sent, blen;
 870     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 871
 872     bytes_sent = save_page_header(rs, f, block, offset |
 873                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 874     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 875                                      migrate_compress_level());
 876     if (blen < 0) {
 877         bytes_sent = 0;
 878         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 879         error_report("compressed data failed!");
 880     } else {
 881         bytes_sent += blen;
 882         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
 883     }
 884
 885     return bytes_sent;
 886 }
 887
 888 static void flush_compressed_data(RAMState *rs)
 889 {
 890     int idx, len, thread_count;
 891
 892     if (!migrate_use_compression()) {
 893         return;
 894     }
 895     thread_count = migrate_compress_threads();
 896
 897     qemu_mutex_lock(&comp_done_lock);
 898     for (idx = 0; idx < thread_count; idx++) {
 899         while (!comp_param[idx].done) {
 900             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 901         }
 902     }
 903     qemu_mutex_unlock(&comp_done_lock);
 904
 905     for (idx = 0; idx < thread_count; idx++) {
 906         qemu_mutex_lock(&comp_param[idx].mutex);
 907         if (!comp_param[idx].quit) {
 908             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 909             rs->bytes_transferred += len;
 910         }
 911         qemu_mutex_unlock(&comp_param[idx].mutex);
 912     }
 913 }
 914
 915 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 916                                        ram_addr_t offset)
 917 {
 918     param->block = block;
 919     param->offset = offset;
 920 }
 921
 922 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
 923                                            ram_addr_t offset)
 924 {
 925     int idx, thread_count, bytes_xmit = -1, pages = -1;
 926
 927     thread_count = migrate_compress_threads();
 928     qemu_mutex_lock(&comp_done_lock);
 929     while (true) {
 930         for (idx = 0; idx < thread_count; idx++) {
 931             if (comp_param[idx].done) {
 932                 comp_param[idx].done = false;
 933                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 934                 qemu_mutex_lock(&comp_param[idx].mutex);
 935                 set_compress_params(&comp_param[idx], block, offset);
 936                 qemu_cond_signal(&comp_param[idx].cond);
 937                 qemu_mutex_unlock(&comp_param[idx].mutex);
 938                 pages = 1;
 939                 rs->norm_pages++;
 940                 rs->bytes_transferred += bytes_xmit;
 941                 break;
 942             }
 943         }
 944         if (pages > 0) {
 945             break;
 946         } else {
 947             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 948         }
 949     }
 950     qemu_mutex_unlock(&comp_done_lock);
 951
 952     return pages;
 953 }
 954
 955 /**
 956  * ram_save_compressed_page: compress the given page and send it to the stream
 957  *
 958  * Returns the number of pages written.
 959  *
 960  * @rs: current RAM state
 961  * @block: block that contains the page we want to send
 962  * @offset: offset inside the block for the page
 963  * @last_stage: if we are at the completion stage
 964  */
 965 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
 966                                     bool last_stage)
 967 {
 968     int pages = -1;
 969     uint64_t bytes_xmit = 0;
 970     uint8_t *p;
 971     int ret, blen;
 972     RAMBlock *block = pss->block;
 973     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 974
 975     p = block->host + offset;
 976
 977     ret = ram_control_save_page(rs->f, block->offset,
 978                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 979     if (bytes_xmit) {
 980         rs->bytes_transferred += bytes_xmit;
 981         pages = 1;
 982     }
 983     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 984         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 985             if (bytes_xmit > 0) {
 986                 rs->norm_pages++;
 987             } else if (bytes_xmit == 0) {
 988                 rs->zero_pages++;
 989             }
 990         }
 991     } else {
 992         /* When starting the process of a new block, the first page of
 993          * the block should be sent out before other pages in the same
 994          * block, and all the pages in last block should have been sent
 995          * out, keeping this order is important, because the 'cont' flag
 996          * is used to avoid resending the block name.
 997          */
 998         if (block != rs->last_sent_block) {
 999             flush_compressed_data(rs);
1000             pages = save_zero_page(rs, block, offset, p);
1001             if (pages == -1) {
1002                 /* Make sure the first page is sent out before other pages */
1003                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1004                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
1005                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1006                                                  migrate_compress_level());
1007                 if (blen > 0) {
1008                     rs->bytes_transferred += bytes_xmit + blen;
1009                     rs->norm_pages++;
1010                     pages = 1;
1011                 } else {
1012                     qemu_file_set_error(rs->f, blen);
1013                     error_report("compressed data failed!");
1014                 }
1015             }
1016             if (pages > 0) {
1017                 ram_release_pages(block->idstr, offset, pages);
1018             }
1019         } else {
1020             pages = save_zero_page(rs, block, offset, p);
1021             if (pages == -1) {
1022                 pages = compress_page_with_multi_thread(rs, block, offset);
1023             } else {
1024                 ram_release_pages(block->idstr, offset, pages);
1025             }
1026         }
1027     }
1028
1029     return pages;
1030 }
1031
1032 /**
1033  * find_dirty_block: find the next dirty page and update any state
1034  * associated with the search process.
1035  *
1036  * Returns if a page is found
1037  *
1038  * @rs: current RAM state
1039  * @pss: data about the state of the current dirty page scan
1040  * @again: set to false if the search has scanned the whole of RAM
1041  */
1042 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1043 {
1044     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1045     if (pss->complete_round && pss->block == rs->last_seen_block &&
1046         pss->page >= rs->last_page) {
1047         /*
1048          * We've been once around the RAM and haven't found anything.
1049          * Give up.
1050          */
1051         *again = false;
1052         return false;
1053     }
1054     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1055         /* Didn't find anything in this RAM Block */
1056         pss->page = 0;
1057         pss->block = QLIST_NEXT_RCU(pss->block, next);
1058         if (!pss->block) {
1059             /* Hit the end of the list */
1060             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1061             /* Flag that we've looped */
1062             pss->complete_round = true;
1063             rs->ram_bulk_stage = false;
1064             if (migrate_use_xbzrle()) {
1065                 /* If xbzrle is on, stop using the data compression at this
1066                  * point. In theory, xbzrle can do better than compression.
1067                  */
1068                 flush_compressed_data(rs);
1069             }
1070         }
1071         /* Didn't find anything this time, but try again on the new block */
1072         *again = true;
1073         return false;
1074     } else {
1075         /* Can go around again, but... */
1076         *again = true;
1077         /* We've found something so probably don't need to */
1078         return true;
1079     }
1080 }
1081
1082 /**
1083  * unqueue_page: gets a page of the queue
1084  *
1085  * Helper for 'get_queued_page' - gets a page off the queue
1086  *
1087  * Returns the block of the page (or NULL if none available)
1088  *
1089  * @rs: current RAM state
1090  * @offset: used to return the offset within the RAMBlock
1091  */
1092 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1093 {
1094     RAMBlock *block = NULL;
1095
1096     qemu_mutex_lock(&rs->src_page_req_mutex);
1097     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1098         struct RAMSrcPageRequest *entry =
1099                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1100         block = entry->rb;
1101         *offset = entry->offset;
1102
1103         if (entry->len > TARGET_PAGE_SIZE) {
1104             entry->len -= TARGET_PAGE_SIZE;
1105             entry->offset += TARGET_PAGE_SIZE;
1106         } else {
1107             memory_region_unref(block->mr);
1108             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1109             g_free(entry);
1110         }
1111     }
1112     qemu_mutex_unlock(&rs->src_page_req_mutex);
1113
1114     return block;
1115 }
1116
1117 /**
1118  * get_queued_page: unqueue a page from the postocpy requests
1119  *
1120  * Skips pages that are already sent (!dirty)
1121  *
1122  * Returns if a queued page is found
1123  *
1124  * @rs: current RAM state
1125  * @pss: data about the state of the current dirty page scan
1126  */
1127 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1128 {
1129     RAMBlock  *block;
1130     ram_addr_t offset;
1131     bool dirty;
1132
1133     do {
1134         block = unqueue_page(rs, &offset);
1135         /*
1136          * We're sending this page, and since it's postcopy nothing else
1137          * will dirty it, and we must make sure it doesn't get sent again
1138          * even if this queue request was received after the background
1139          * search already sent it.
1140          */
1141         if (block) {
1142             unsigned long page;
1143
1144             page = offset >> TARGET_PAGE_BITS;
1145             dirty = test_bit(page, block->bmap);
1146             if (!dirty) {
1147                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1148                        page, test_bit(page, block->unsentmap));
1149             } else {
1150                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1151             }
1152         }
1153
1154     } while (block && !dirty);
1155
1156     if (block) {
1157         /*
1158          * As soon as we start servicing pages out of order, then we have
1159          * to kill the bulk stage, since the bulk stage assumes
1160          * in (migration_bitmap_find_and_reset_dirty) that every page is
1161          * dirty, that's no longer true.
1162          */
1163         rs->ram_bulk_stage = false;
1164
1165         /*
1166          * We want the background search to continue from the queued page
1167          * since the guest is likely to want other pages near to the page
1168          * it just requested.
1169          */
1170         pss->block = block;
1171         pss->page = offset >> TARGET_PAGE_BITS;
1172     }
1173
1174     return !!block;
1175 }
1176
1177 /**
1178  * migration_page_queue_free: drop any remaining pages in the ram
1179  * request queue
1180  *
1181  * It should be empty at the end anyway, but in error cases there may
1182  * be some left.  in case that there is any page left, we drop it.
1183  *
1184  */
1185 void migration_page_queue_free(void)
1186 {
1187     struct RAMSrcPageRequest *mspr, *next_mspr;
1188     RAMState *rs = &ram_state;
1189     /* This queue generally should be empty - but in the case of a failed
1190      * migration might have some droppings in.
1191      */
1192     rcu_read_lock();
1193     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1194         memory_region_unref(mspr->rb->mr);
1195         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1196         g_free(mspr);
1197     }
1198     rcu_read_unlock();
1199 }
1200
1201 /**
1202  * ram_save_queue_pages: queue the page for transmission
1203  *
1204  * A request from postcopy destination for example.
1205  *
1206  * Returns zero on success or negative on error
1207  *
1208  * @rbname: Name of the RAMBLock of the request. NULL means the
1209  *          same that last one.
1210  * @start: starting address from the start of the RAMBlock
1211  * @len: length (in bytes) to send
1212  */
1213 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1214 {
1215     RAMBlock *ramblock;
1216     RAMState *rs = &ram_state;
1217
1218     rs->postcopy_requests++;
1219     rcu_read_lock();
1220     if (!rbname) {
1221         /* Reuse last RAMBlock */
1222         ramblock = rs->last_req_rb;
1223
1224         if (!ramblock) {
1225             /*
1226              * Shouldn't happen, we can't reuse the last RAMBlock if
1227              * it's the 1st request.
1228              */
1229             error_report("ram_save_queue_pages no previous block");
1230             goto err;
1231         }
1232     } else {
1233         ramblock = qemu_ram_block_by_name(rbname);
1234
1235         if (!ramblock) {
1236             /* We shouldn't be asked for a non-existent RAMBlock */
1237             error_report("ram_save_queue_pages no block '%s'", rbname);
1238             goto err;
1239         }
1240         rs->last_req_rb = ramblock;
1241     }
1242     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1243     if (start+len > ramblock->used_length) {
1244         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1245                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1246                      __func__, start, len, ramblock->used_length);
1247         goto err;
1248     }
1249
1250     struct RAMSrcPageRequest *new_entry =
1251         g_malloc0(sizeof(struct RAMSrcPageRequest));
1252     new_entry->rb = ramblock;
1253     new_entry->offset = start;
1254     new_entry->len = len;
1255
1256     memory_region_ref(ramblock->mr);
1257     qemu_mutex_lock(&rs->src_page_req_mutex);
1258     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1259     qemu_mutex_unlock(&rs->src_page_req_mutex);
1260     rcu_read_unlock();
1261
1262     return 0;
1263
1264 err:
1265     rcu_read_unlock();
1266     return -1;
1267 }
1268
1269 /**
1270  * ram_save_target_page: save one target page
1271  *
1272  * Returns the number of pages written
1273  *
1274  * @rs: current RAM state
1275  * @ms: current migration state
1276  * @pss: data about the page we want to send
1277  * @last_stage: if we are at the completion stage
1278  */
1279 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1280                                 bool last_stage)
1281 {
1282     int res = 0;
1283
1284     /* Check the pages is dirty and if it is send it */
1285     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1286         /*
1287          * If xbzrle is on, stop using the data compression after first
1288          * round of migration even if compression is enabled. In theory,
1289          * xbzrle can do better than compression.
1290          */
1291         if (migrate_use_compression() &&
1292             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1293             res = ram_save_compressed_page(rs, pss, last_stage);
1294         } else {
1295             res = ram_save_page(rs, pss, last_stage);
1296         }
1297
1298         if (res < 0) {
1299             return res;
1300         }
1301         if (pss->block->unsentmap) {
1302             clear_bit(pss->page, pss->block->unsentmap);
1303         }
1304     }
1305
1306     return res;
1307 }
1308
1309 /**
1310  * ram_save_host_page: save a whole host page
1311  *
1312  * Starting at *offset send pages up to the end of the current host
1313  * page. It's valid for the initial offset to point into the middle of
1314  * a host page in which case the remainder of the hostpage is sent.
1315  * Only dirty target pages are sent. Note that the host page size may
1316  * be a huge page for this block.
1317  * The saving stops at the boundary of the used_length of the block
1318  * if the RAMBlock isn't a multiple of the host page size.
1319  *
1320  * Returns the number of pages written or negative on error
1321  *
1322  * @rs: current RAM state
1323  * @ms: current migration state
1324  * @pss: data about the page we want to send
1325  * @last_stage: if we are at the completion stage
1326  */
1327 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1328                               bool last_stage)
1329 {
1330     int tmppages, pages = 0;
1331     size_t pagesize_bits =
1332         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1333
1334     do {
1335         tmppages = ram_save_target_page(rs, pss, last_stage);
1336         if (tmppages < 0) {
1337             return tmppages;
1338         }
1339
1340         pages += tmppages;
1341         pss->page++;
1342     } while ((pss->page & (pagesize_bits - 1)) &&
1343              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1344
1345     /* The offset we leave with is the last one we looked at */
1346     pss->page--;
1347     return pages;
1348 }
1349
1350 /**
1351  * ram_find_and_save_block: finds a dirty page and sends it to f
1352  *
1353  * Called within an RCU critical section.
1354  *
1355  * Returns the number of pages written where zero means no dirty pages
1356  *
1357  * @rs: current RAM state
1358  * @last_stage: if we are at the completion stage
1359  *
1360  * On systems where host-page-size > target-page-size it will send all the
1361  * pages in a host page that are dirty.
1362  */
1363
1364 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1365 {
1366     PageSearchStatus pss;
1367     int pages = 0;
1368     bool again, found;
1369
1370     /* No dirty page as there is zero RAM */
1371     if (!ram_bytes_total()) {
1372         return pages;
1373     }
1374
1375     pss.block = rs->last_seen_block;
1376     pss.page = rs->last_page;
1377     pss.complete_round = false;
1378
1379     if (!pss.block) {
1380         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1381     }
1382
1383     do {
1384         again = true;
1385         found = get_queued_page(rs, &pss);
1386
1387         if (!found) {
1388             /* priority queue empty, so just search for something dirty */
1389             found = find_dirty_block(rs, &pss, &again);
1390         }
1391
1392         if (found) {
1393             pages = ram_save_host_page(rs, &pss, last_stage);
1394         }
1395     } while (!pages && again);
1396
1397     rs->last_seen_block = pss.block;
1398     rs->last_page = pss.page;
1399
1400     return pages;
1401 }
1402
1403 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1404 {
1405     uint64_t pages = size / TARGET_PAGE_SIZE;
1406     RAMState *rs = &ram_state;
1407
1408     if (zero) {
1409         rs->zero_pages += pages;
1410     } else {
1411         rs->norm_pages += pages;
1412         rs->bytes_transferred += size;
1413         qemu_update_position(f, size);
1414     }
1415 }
1416
1417 uint64_t ram_bytes_total(void)
1418 {
1419     RAMBlock *block;
1420     uint64_t total = 0;
1421
1422     rcu_read_lock();
1423     RAMBLOCK_FOREACH(block) {
1424         total += block->used_length;
1425     }
1426     rcu_read_unlock();
1427     return total;
1428 }
1429
1430 void free_xbzrle_decoded_buf(void)
1431 {
1432     g_free(xbzrle_decoded_buf);
1433     xbzrle_decoded_buf = NULL;
1434 }
1435
1436 static void ram_migration_cleanup(void *opaque)
1437 {
1438     RAMBlock *block;
1439
1440     /* caller have hold iothread lock or is in a bh, so there is
1441      * no writing race against this migration_bitmap
1442      */
1443     memory_global_dirty_log_stop();
1444
1445     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1446         g_free(block->bmap);
1447         block->bmap = NULL;
1448         g_free(block->unsentmap);
1449         block->unsentmap = NULL;
1450     }
1451
1452     XBZRLE_cache_lock();
1453     if (XBZRLE.cache) {
1454         cache_fini(XBZRLE.cache);
1455         g_free(XBZRLE.encoded_buf);
1456         g_free(XBZRLE.current_buf);
1457         g_free(ZERO_TARGET_PAGE);
1458         XBZRLE.cache = NULL;
1459         XBZRLE.encoded_buf = NULL;
1460         XBZRLE.current_buf = NULL;
1461     }
1462     XBZRLE_cache_unlock();
1463 }
1464
1465 static void ram_state_reset(RAMState *rs)
1466 {
1467     rs->last_seen_block = NULL;
1468     rs->last_sent_block = NULL;
1469     rs->last_page = 0;
1470     rs->last_version = ram_list.version;
1471     rs->ram_bulk_stage = true;
1472 }
1473
1474 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1475
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, 128 bits per line
 *
 * Set bits are printed as '1' and clear bits as '.'.  Each line is
 * prefixed with the index of its first bit.
 *
 * 'todump' is the bitmap to print.  A NULL bitmap is tolerated and
 * produces no output.
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * 'pages' is the number of bits in the bitmap.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to dump; avoid dereferencing a NULL bitmap */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            /* PRIx64 expects an unsigned 64-bit argument */
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1509
1510 /* **** functions for postcopy ***** */
1511
1512 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1513 {
1514     struct RAMBlock *block;
1515
1516     RAMBLOCK_FOREACH(block) {
1517         unsigned long *bitmap = block->bmap;
1518         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1519         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1520
1521         while (run_start < range) {
1522             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1523             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1524                               (run_end - run_start) << TARGET_PAGE_BITS);
1525             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1526         }
1527     }
1528 }
1529
1530 /**
1531  * postcopy_send_discard_bm_ram: discard a RAMBlock
1532  *
1533  * Returns zero on success
1534  *
1535  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1536  * Note: At this point the 'unsentmap' is the processed bitmap combined
1537  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1538  *
1539  * @ms: current migration state
1540  * @pds: state for postcopy
1541  * @start: RAMBlock starting page
1542  * @length: RAMBlock size
1543  */
1544 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1545                                         PostcopyDiscardState *pds,
1546                                         RAMBlock *block)
1547 {
1548     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1549     unsigned long current;
1550     unsigned long *unsentmap = block->unsentmap;
1551
1552     for (current = 0; current < end; ) {
1553         unsigned long one = find_next_bit(unsentmap, end, current);
1554
1555         if (one <= end) {
1556             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1557             unsigned long discard_length;
1558
1559             if (zero >= end) {
1560                 discard_length = end - one;
1561             } else {
1562                 discard_length = zero - one;
1563             }
1564             if (discard_length) {
1565                 postcopy_discard_send_range(ms, pds, one, discard_length);
1566             }
1567             current = one + discard_length;
1568         } else {
1569             current = one;
1570         }
1571     }
1572
1573     return 0;
1574 }
1575
1576 /**
1577  * postcopy_each_ram_send_discard: discard all RAMBlocks
1578  *
1579  * Returns 0 for success or negative for error
1580  *
1581  * Utility for the outgoing postcopy code.
1582  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1583  *   passing it bitmap indexes and name.
1584  * (qemu_ram_foreach_block ends up passing unscaled lengths
1585  *  which would mean postcopy code would have to deal with target page)
1586  *
1587  * @ms: current migration state
1588  */
1589 static int postcopy_each_ram_send_discard(MigrationState *ms)
1590 {
1591     struct RAMBlock *block;
1592     int ret;
1593
1594     RAMBLOCK_FOREACH(block) {
1595         PostcopyDiscardState *pds =
1596             postcopy_discard_send_init(ms, block->idstr);
1597
1598         /*
1599          * Postcopy sends chunks of bitmap over the wire, but it
1600          * just needs indexes at this point, avoids it having
1601          * target page specific code.
1602          */
1603         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1604         postcopy_discard_send_finish(ms, pds);
1605         if (ret) {
1606             return ret;
1607         }
1608     }
1609
1610     return 0;
1611 }
1612
1613 /**
1614  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1615  *
1616  * Helper for postcopy_chunk_hostpages; it's called twice to
1617  * canonicalize the two bitmaps, that are similar, but one is
1618  * inverted.
1619  *
1620  * Postcopy requires that all target pages in a hostpage are dirty or
1621  * clean, not a mix.  This function canonicalizes the bitmaps.
1622  *
1623  * @ms: current migration state
1624  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1625  *               otherwise we need to canonicalize partially dirty host pages
1626  * @block: block that contains the page we want to canonicalize
1627  * @pds: state for postcopy
1628  */
1629 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1630                                           RAMBlock *block,
1631                                           PostcopyDiscardState *pds)
1632 {
1633     RAMState *rs = &ram_state;
1634     unsigned long *bitmap = block->bmap;
1635     unsigned long *unsentmap = block->unsentmap;
1636     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1637     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1638     unsigned long run_start;
1639
1640     if (block->page_size == TARGET_PAGE_SIZE) {
1641         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1642         return;
1643     }
1644
1645     if (unsent_pass) {
1646         /* Find a sent page */
1647         run_start = find_next_zero_bit(unsentmap, pages, 0);
1648     } else {
1649         /* Find a dirty page */
1650         run_start = find_next_bit(bitmap, pages, 0);
1651     }
1652
1653     while (run_start < pages) {
1654         bool do_fixup = false;
1655         unsigned long fixup_start_addr;
1656         unsigned long host_offset;
1657
1658         /*
1659          * If the start of this run of pages is in the middle of a host
1660          * page, then we need to fixup this host page.
1661          */
1662         host_offset = run_start % host_ratio;
1663         if (host_offset) {
1664             do_fixup = true;
1665             run_start -= host_offset;
1666             fixup_start_addr = run_start;
1667             /* For the next pass */
1668             run_start = run_start + host_ratio;
1669         } else {
1670             /* Find the end of this run */
1671             unsigned long run_end;
1672             if (unsent_pass) {
1673                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1674             } else {
1675                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1676             }
1677             /*
1678              * If the end isn't at the start of a host page, then the
1679              * run doesn't finish at the end of a host page
1680              * and we need to discard.
1681              */
1682             host_offset = run_end % host_ratio;
1683             if (host_offset) {
1684                 do_fixup = true;
1685                 fixup_start_addr = run_end - host_offset;
1686                 /*
1687                  * This host page has gone, the next loop iteration starts
1688                  * from after the fixup
1689                  */
1690                 run_start = fixup_start_addr + host_ratio;
1691             } else {
1692                 /*
1693                  * No discards on this iteration, next loop starts from
1694                  * next sent/dirty page
1695                  */
1696                 run_start = run_end + 1;
1697             }
1698         }
1699
1700         if (do_fixup) {
1701             unsigned long page;
1702
1703             /* Tell the destination to discard this page */
1704             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1705                 /* For the unsent_pass we:
1706                  *     discard partially sent pages
1707                  * For the !unsent_pass (dirty) we:
1708                  *     discard partially dirty pages that were sent
1709                  *     (any partially sent pages were already discarded
1710                  *     by the previous unsent_pass)
1711                  */
1712                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1713                                             host_ratio);
1714             }
1715
1716             /* Clean up the bitmap */
1717             for (page = fixup_start_addr;
1718                  page < fixup_start_addr + host_ratio; page++) {
1719                 /* All pages in this host page are now not sent */
1720                 set_bit(page, unsentmap);
1721
1722                 /*
1723                  * Remark them as dirty, updating the count for any pages
1724                  * that weren't previously dirty.
1725                  */
1726                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1727             }
1728         }
1729
1730         if (unsent_pass) {
1731             /* Find the next sent page for the next iteration */
1732             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1733         } else {
1734             /* Find the next dirty page for the next iteration */
1735             run_start = find_next_bit(bitmap, pages, run_start);
1736         }
1737     }
1738 }
1739
1740 /**
1741  * postcopy_chuck_hostpages: discrad any partially sent host page
1742  *
1743  * Utility for the outgoing postcopy code.
1744  *
1745  * Discard any partially sent host-page size chunks, mark any partially
1746  * dirty host-page size chunks as all dirty.  In this case the host-page
1747  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1748  *
1749  * Returns zero on success
1750  *
1751  * @ms: current migration state
1752  * @block: block we want to work with
1753  */
1754 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1755 {
1756     PostcopyDiscardState *pds =
1757         postcopy_discard_send_init(ms, block->idstr);
1758
1759     /* First pass: Discard all partially sent host pages */
1760     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1761     /*
1762      * Second pass: Ensure that all partially dirty host pages are made
1763      * fully dirty.
1764      */
1765     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1766
1767     postcopy_discard_send_finish(ms, pds);
1768     return 0;
1769 }
1770
1771 /**
1772  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1773  *
1774  * Returns zero on success
1775  *
1776  * Transmit the set of pages to be discarded after precopy to the target
1777  * these are pages that:
1778  *     a) Have been previously transmitted but are now dirty again
1779  *     b) Pages that have never been transmitted, this ensures that
1780  *        any pages on the destination that have been mapped by background
1781  *        tasks get discarded (transparent huge pages is the specific concern)
1782  * Hopefully this is pretty sparse
1783  *
1784  * @ms: current migration state
1785  */
1786 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1787 {
1788     RAMState *rs = &ram_state;
1789     RAMBlock *block;
1790     int ret;
1791
1792     rcu_read_lock();
1793
1794     /* This should be our last sync, the src is now paused */
1795     migration_bitmap_sync(rs);
1796
1797     /* Easiest way to make sure we don't resume in the middle of a host-page */
1798     rs->last_seen_block = NULL;
1799     rs->last_sent_block = NULL;
1800     rs->last_page = 0;
1801
1802     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1803         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1804         unsigned long *bitmap = block->bmap;
1805         unsigned long *unsentmap = block->unsentmap;
1806
1807         if (!unsentmap) {
1808             /* We don't have a safe way to resize the sentmap, so
1809              * if the bitmap was resized it will be NULL at this
1810              * point.
1811              */
1812             error_report("migration ram resized during precopy phase");
1813             rcu_read_unlock();
1814             return -EINVAL;
1815         }
1816         /* Deal with TPS != HPS and huge pages */
1817         ret = postcopy_chunk_hostpages(ms, block);
1818         if (ret) {
1819             rcu_read_unlock();
1820             return ret;
1821         }
1822
1823         /*
1824          * Update the unsentmap to be unsentmap = unsentmap | dirty
1825          */
1826         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1827 #ifdef DEBUG_POSTCOPY
1828         ram_debug_dump_bitmap(unsentmap, true, pages);
1829 #endif
1830     }
1831     trace_ram_postcopy_send_discard_bitmap();
1832
1833     ret = postcopy_each_ram_send_discard(ms);
1834     rcu_read_unlock();
1835
1836     return ret;
1837 }
1838
1839 /**
1840  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1841  *
1842  * Returns zero on success
1843  *
1844  * @rbname: name of the RAMBlock of the request. NULL means the
1845  *          same that last one.
1846  * @start: RAMBlock starting page
1847  * @length: RAMBlock size
1848  */
1849 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1850 {
1851     int ret = -1;
1852
1853     trace_ram_discard_range(rbname, start, length);
1854
1855     rcu_read_lock();
1856     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1857
1858     if (!rb) {
1859         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1860         goto err;
1861     }
1862
1863     ret = ram_block_discard_range(rb, start, length);
1864
1865 err:
1866     rcu_read_unlock();
1867
1868     return ret;
1869 }
1870
1871 static int ram_state_init(RAMState *rs)
1872 {
1873     memset(rs, 0, sizeof(*rs));
1874     qemu_mutex_init(&rs->bitmap_mutex);
1875     qemu_mutex_init(&rs->src_page_req_mutex);
1876     QSIMPLEQ_INIT(&rs->src_page_requests);
1877
1878     if (migrate_use_xbzrle()) {
1879         XBZRLE_cache_lock();
1880         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1881         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1882                                   TARGET_PAGE_SIZE,
1883                                   TARGET_PAGE_SIZE);
1884         if (!XBZRLE.cache) {
1885             XBZRLE_cache_unlock();
1886             error_report("Error creating cache");
1887             return -1;
1888         }
1889         XBZRLE_cache_unlock();
1890
1891         /* We prefer not to abort if there is no memory */
1892         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1893         if (!XBZRLE.encoded_buf) {
1894             error_report("Error allocating encoded_buf");
1895             return -1;
1896         }
1897
1898         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1899         if (!XBZRLE.current_buf) {
1900             error_report("Error allocating current_buf");
1901             g_free(XBZRLE.encoded_buf);
1902             XBZRLE.encoded_buf = NULL;
1903             return -1;
1904         }
1905     }
1906
1907     /* For memory_global_dirty_log_start below.  */
1908     qemu_mutex_lock_iothread();
1909
1910     qemu_mutex_lock_ramlist();
1911     rcu_read_lock();
1912     ram_state_reset(rs);
1913
1914     /* Skip setting bitmap if there is no RAM */
1915     if (ram_bytes_total()) {
1916         RAMBlock *block;
1917
1918         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1919             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1920
1921             block->bmap = bitmap_new(pages);
1922             bitmap_set(block->bmap, 0, pages);
1923             if (migrate_postcopy_ram()) {
1924                 block->unsentmap = bitmap_new(pages);
1925                 bitmap_set(block->unsentmap, 0, pages);
1926             }
1927         }
1928     }
1929
1930     /*
1931      * Count the total number of pages used by ram blocks not including any
1932      * gaps due to alignment or unplugs.
1933      */
1934     rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1935
1936     memory_global_dirty_log_start();
1937     migration_bitmap_sync(rs);
1938     qemu_mutex_unlock_ramlist();
1939     qemu_mutex_unlock_iothread();
1940     rcu_read_unlock();
1941
1942     return 0;
1943 }
1944
1945 /*
1946  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1947  * long-running RCU critical section.  When rcu-reclaims in the code
1948  * start to become numerous it will be necessary to reduce the
1949  * granularity of these critical sections.
1950  */
1951
1952 /**
1953  * ram_save_setup: Setup RAM for migration
1954  *
1955  * Returns zero to indicate success and negative for error
1956  *
1957  * @f: QEMUFile where to send the data
1958  * @opaque: RAMState pointer
1959  */
1960 static int ram_save_setup(QEMUFile *f, void *opaque)
1961 {
1962     RAMState *rs = opaque;
1963     RAMBlock *block;
1964
1965     /* migration has already setup the bitmap, reuse it. */
1966     if (!migration_in_colo_state()) {
1967         if (ram_state_init(rs) < 0) {
1968             return -1;
1969          }
1970     }
1971     rs->f = f;
1972
1973     rcu_read_lock();
1974
1975     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1976
1977     RAMBLOCK_FOREACH(block) {
1978         qemu_put_byte(f, strlen(block->idstr));
1979         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1980         qemu_put_be64(f, block->used_length);
1981         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1982             qemu_put_be64(f, block->page_size);
1983         }
1984     }
1985
1986     rcu_read_unlock();
1987
1988     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1989     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1990
1991     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1992
1993     return 0;
1994 }
1995
1996 /**
1997  * ram_save_iterate: iterative stage for migration
1998  *
1999  * Returns zero to indicate success and negative for error
2000  *
2001  * @f: QEMUFile where to send the data
2002  * @opaque: RAMState pointer
2003  */
2004 static int ram_save_iterate(QEMUFile *f, void *opaque)
2005 {
2006     RAMState *rs = opaque;
2007     int ret;
2008     int i;
2009     int64_t t0;
2010     int done = 0;
2011
2012     rcu_read_lock();
2013     if (ram_list.version != rs->last_version) {
2014         ram_state_reset(rs);
2015     }
2016
2017     /* Read version before ram_list.blocks */
2018     smp_rmb();
2019
2020     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2021
2022     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2023     i = 0;
2024     while ((ret = qemu_file_rate_limit(f)) == 0) {
2025         int pages;
2026
2027         pages = ram_find_and_save_block(rs, false);
2028         /* no more pages to sent */
2029         if (pages == 0) {
2030             done = 1;
2031             break;
2032         }
2033         rs->iterations++;
2034
2035         /* we want to check in the 1st loop, just in case it was the 1st time
2036            and we had to sync the dirty bitmap.
2037            qemu_get_clock_ns() is a bit expensive, so we only check each some
2038            iterations
2039         */
2040         if ((i & 63) == 0) {
2041             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2042             if (t1 > MAX_WAIT) {
2043                 trace_ram_save_iterate_big_wait(t1, i);
2044                 break;
2045             }
2046         }
2047         i++;
2048     }
2049     flush_compressed_data(rs);
2050     rcu_read_unlock();
2051
2052     /*
2053      * Must occur before EOS (or any QEMUFile operation)
2054      * because of RDMA protocol.
2055      */
2056     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2057
2058     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2059     rs->bytes_transferred += 8;
2060
2061     ret = qemu_file_get_error(f);
2062     if (ret < 0) {
2063         return ret;
2064     }
2065
2066     return done;
2067 }
2068
2069 /**
2070  * ram_save_complete: function called to send the remaining amount of ram
2071  *
2072  * Returns zero to indicate success
2073  *
2074  * Called with iothread lock
2075  *
2076  * @f: QEMUFile where to send the data
2077  * @opaque: RAMState pointer
2078  */
2079 static int ram_save_complete(QEMUFile *f, void *opaque)
2080 {
2081     RAMState *rs = opaque;
2082
2083     rcu_read_lock();
2084
2085     if (!migration_in_postcopy()) {
2086         migration_bitmap_sync(rs);
2087     }
2088
2089     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2090
2091     /* try transferring iterative blocks of memory */
2092
2093     /* flush all remaining blocks regardless of rate limiting */
2094     while (true) {
2095         int pages;
2096
2097         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2098         /* no more blocks to sent */
2099         if (pages == 0) {
2100             break;
2101         }
2102     }
2103
2104     flush_compressed_data(rs);
2105     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2106
2107     rcu_read_unlock();
2108
2109     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2110
2111     return 0;
2112 }
2113
2114 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2115                              uint64_t *non_postcopiable_pending,
2116                              uint64_t *postcopiable_pending)
2117 {
2118     RAMState *rs = opaque;
2119     uint64_t remaining_size;
2120
2121     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2122
2123     if (!migration_in_postcopy() &&
2124         remaining_size < max_size) {
2125         qemu_mutex_lock_iothread();
2126         rcu_read_lock();
2127         migration_bitmap_sync(rs);
2128         rcu_read_unlock();
2129         qemu_mutex_unlock_iothread();
2130         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2131     }
2132
2133     /* We can do postcopy, and all the data is postcopiable */
2134     *postcopiable_pending += remaining_size;
2135 }
2136
2137 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2138 {
2139     unsigned int xh_len;
2140     int xh_flags;
2141     uint8_t *loaded_data;
2142
2143     if (!xbzrle_decoded_buf) {
2144         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2145     }
2146     loaded_data = xbzrle_decoded_buf;
2147
2148     /* extract RLE header */
2149     xh_flags = qemu_get_byte(f);
2150     xh_len = qemu_get_be16(f);
2151
2152     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2153         error_report("Failed to load XBZRLE page - wrong compression!");
2154         return -1;
2155     }
2156
2157     if (xh_len > TARGET_PAGE_SIZE) {
2158         error_report("Failed to load XBZRLE page - len overflow!");
2159         return -1;
2160     }
2161     /* load data and decode */
2162     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2163
2164     /* decode RLE */
2165     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2166                              TARGET_PAGE_SIZE) == -1) {
2167         error_report("Failed to load XBZRLE page - decode error!");
2168         return -1;
2169     }
2170
2171     return 0;
2172 }
2173
2174 /**
2175  * ram_block_from_stream: read a RAMBlock id from the migration stream
2176  *
2177  * Must be called from within a rcu critical section.
2178  *
2179  * Returns a pointer from within the RCU-protected ram_list.
2180  *
2181  * @f: QEMUFile where to read the data from
2182  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2183  */
2184 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2185 {
2186     static RAMBlock *block = NULL;
2187     char id[256];
2188     uint8_t len;
2189
2190     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2191         if (!block) {
2192             error_report("Ack, bad migration stream!");
2193             return NULL;
2194         }
2195         return block;
2196     }
2197
2198     len = qemu_get_byte(f);
2199     qemu_get_buffer(f, (uint8_t *)id, len);
2200     id[len] = 0;
2201
2202     block = qemu_ram_block_by_name(id);
2203     if (!block) {
2204         error_report("Can't find block %s", id);
2205         return NULL;
2206     }
2207
2208     return block;
2209 }
2210
2211 static inline void *host_from_ram_block_offset(RAMBlock *block,
2212                                                ram_addr_t offset)
2213 {
2214     if (!offset_in_ramblock(block, offset)) {
2215         return NULL;
2216     }
2217
2218     return block->host + offset;
2219 }
2220
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The range is written unless @ch is zero and the range already reads
 * as zero, in which case the memory is left alone.
 *
 * @host: host address of the range
 * @ch: byte the range is filled with
 * @size: size of the range in bytes
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already all zero and asked for zero: nothing to write */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2237
/*
 * Thread body of one decompression worker.
 *
 * @opaque is the worker's DecompressParam.  The worker sleeps on
 * param->cond until either param->des is set (a page is waiting in
 * param->compbuf) or param->quit is set.  For each page it inflates
 * param->compbuf into the destination, then marks itself done and signals
 * decomp_done_cond.  Always returns NULL.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take a private copy of the request and clear the slot */
            des = param->des;
            len = param->len;
            param->des = 0;
            /* Decompress without holding the per-worker mutex */
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() can fail in some cases, especially when the
             * page was dirtied while it was being compressed.  That is
             * not a problem because the dirty page will be retransferred
             * and uncompress() won't break the data in other pages.
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            /* 'done' is only changed with decomp_done_lock held */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            /* Retake the per-worker mutex before testing quit/des again */
            qemu_mutex_lock(&param->mutex);
        } else {
            /* Nothing queued: sleep until signalled on param->cond */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
2276
2277 static void wait_for_decompress_done(void)
2278 {
2279     int idx, thread_count;
2280
2281     if (!migrate_use_compression()) {
2282         return;
2283     }
2284
2285     thread_count = migrate_decompress_threads();
2286     qemu_mutex_lock(&decomp_done_lock);
2287     for (idx = 0; idx < thread_count; idx++) {
2288         while (!decomp_param[idx].done) {
2289             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2290         }
2291     }
2292     qemu_mutex_unlock(&decomp_done_lock);
2293 }
2294
2295 void migrate_decompress_threads_create(void)
2296 {
2297     int i, thread_count;
2298
2299     thread_count = migrate_decompress_threads();
2300     decompress_threads = g_new0(QemuThread, thread_count);
2301     decomp_param = g_new0(DecompressParam, thread_count);
2302     qemu_mutex_init(&decomp_done_lock);
2303     qemu_cond_init(&decomp_done_cond);
2304     for (i = 0; i < thread_count; i++) {
2305         qemu_mutex_init(&decomp_param[i].mutex);
2306         qemu_cond_init(&decomp_param[i].cond);
2307         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2308         decomp_param[i].done = true;
2309         decomp_param[i].quit = false;
2310         qemu_thread_create(decompress_threads + i, "decompress",
2311                            do_data_decompress, decomp_param + i,
2312                            QEMU_THREAD_JOINABLE);
2313     }
2314 }
2315
2316 void migrate_decompress_threads_join(void)
2317 {
2318     int i, thread_count;
2319
2320     thread_count = migrate_decompress_threads();
2321     for (i = 0; i < thread_count; i++) {
2322         qemu_mutex_lock(&decomp_param[i].mutex);
2323         decomp_param[i].quit = true;
2324         qemu_cond_signal(&decomp_param[i].cond);
2325         qemu_mutex_unlock(&decomp_param[i].mutex);
2326     }
2327     for (i = 0; i < thread_count; i++) {
2328         qemu_thread_join(decompress_threads + i);
2329         qemu_mutex_destroy(&decomp_param[i].mutex);
2330         qemu_cond_destroy(&decomp_param[i].cond);
2331         g_free(decomp_param[i].compbuf);
2332     }
2333     g_free(decompress_threads);
2334     g_free(decomp_param);
2335     decompress_threads = NULL;
2336     decomp_param = NULL;
2337 }
2338
/*
 * Hand one compressed page to an idle decompression worker.
 *
 * Scans decomp_param[] for a worker whose 'done' flag is set, reads @len
 * bytes from @f into that worker's compbuf, records @host as the
 * destination and wakes the worker.  If no worker is idle, waits on
 * decomp_done_cond and scans again.
 *
 * @f: QEMUFile to read the compressed data from
 * @host: where the decompressed page has to end up
 * @len: number of compressed bytes to read from @f
 */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    /* The 'done' flags are read and written with decomp_done_lock held */
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                /* Claim this worker before filling in its request */
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                /* A non-NULL 'des' is what the worker treats as a request */
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            /* The for loop was left early, so a worker took the page */
            break;
        } else {
            /* Every worker is busy: wait for one of them to finish */
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
2367
2368 /**
2369  * ram_postcopy_incoming_init: allocate postcopy data structures
2370  *
2371  * Returns 0 for success and negative if there was one error
2372  *
2373  * @mis: current migration incoming state
2374  *
2375  * Allocate data structures etc needed by incoming migration with
2376  * postcopy-ram. postcopy-ram's similarly names
2377  * postcopy_ram_incoming_init does the work.
2378  */
2379 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2380 {
2381     unsigned long ram_pages = last_ram_page();
2382
2383     return postcopy_ram_incoming_init(mis, ram_pages);
2384 }
2385
2386 /**
2387  * ram_load_postcopy: load a page in postcopy case
2388  *
2389  * Returns 0 for success or -errno in case of error
2390  *
2391  * Called in postcopy mode by ram_load().
2392  * rcu_read_lock is taken prior to this being called.
2393  *
2394  * @f: QEMUFile where to send the data
2395  */
2396 static int ram_load_postcopy(QEMUFile *f)
2397 {
2398     int flags = 0, ret = 0;
2399     bool place_needed = false;
2400     bool matching_page_sizes = false;
2401     MigrationIncomingState *mis = migration_incoming_get_current();
2402     /* Temporary page that is later 'placed' */
2403     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2404     void *last_host = NULL;
2405     bool all_zero = false;
2406
2407     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2408         ram_addr_t addr;
2409         void *host = NULL;
2410         void *page_buffer = NULL;
2411         void *place_source = NULL;
2412         RAMBlock *block = NULL;
2413         uint8_t ch;
2414
2415         addr = qemu_get_be64(f);
2416         flags = addr & ~TARGET_PAGE_MASK;
2417         addr &= TARGET_PAGE_MASK;
2418
2419         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2420         place_needed = false;
2421         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2422             block = ram_block_from_stream(f, flags);
2423
2424             host = host_from_ram_block_offset(block, addr);
2425             if (!host) {
2426                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2427                 ret = -EINVAL;
2428                 break;
2429             }
2430             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2431             /*
2432              * Postcopy requires that we place whole host pages atomically;
2433              * these may be huge pages for RAMBlocks that are backed by
2434              * hugetlbfs.
2435              * To make it atomic, the data is read into a temporary page
2436              * that's moved into place later.
2437              * The migration protocol uses,  possibly smaller, target-pages
2438              * however the source ensures it always sends all the components
2439              * of a host page in order.
2440              */
2441             page_buffer = postcopy_host_page +
2442                           ((uintptr_t)host & (block->page_size - 1));
2443             /* If all TP are zero then we can optimise the place */
2444             if (!((uintptr_t)host & (block->page_size - 1))) {
2445                 all_zero = true;
2446             } else {
2447                 /* not the 1st TP within the HP */
2448                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2449                     error_report("Non-sequential target page %p/%p",
2450                                   host, last_host);
2451                     ret = -EINVAL;
2452                     break;
2453                 }
2454             }
2455
2456
2457             /*
2458              * If it's the last part of a host page then we place the host
2459              * page
2460              */
2461             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2462                                      (block->page_size - 1)) == 0;
2463             place_source = postcopy_host_page;
2464         }
2465         last_host = host;
2466
2467         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2468         case RAM_SAVE_FLAG_ZERO:
2469             ch = qemu_get_byte(f);
2470             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2471             if (ch) {
2472                 all_zero = false;
2473             }
2474             break;
2475
2476         case RAM_SAVE_FLAG_PAGE:
2477             all_zero = false;
2478             if (!place_needed || !matching_page_sizes) {
2479                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2480             } else {
2481                 /* Avoids the qemu_file copy during postcopy, which is
2482                  * going to do a copy later; can only do it when we
2483                  * do this read in one go (matching page sizes)
2484                  */
2485                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2486                                          TARGET_PAGE_SIZE);
2487             }
2488             break;
2489         case RAM_SAVE_FLAG_EOS:
2490             /* normal exit */
2491             break;
2492         default:
2493             error_report("Unknown combination of migration flags: %#x"
2494                          " (postcopy mode)", flags);
2495             ret = -EINVAL;
2496         }
2497
2498         if (place_needed) {
2499             /* This gets called at the last target page in the host page */
2500             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2501
2502             if (all_zero) {
2503                 ret = postcopy_place_page_zero(mis, place_dest,
2504                                                block->page_size);
2505             } else {
2506                 ret = postcopy_place_page(mis, place_dest,
2507                                           place_source, block->page_size);
2508             }
2509         }
2510         if (!ret) {
2511             ret = qemu_file_get_error(f);
2512         }
2513     }
2514
2515     return ret;
2516 }
2517
2518 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2519 {
2520     int flags = 0, ret = 0;
2521     static uint64_t seq_iter;
2522     int len = 0;
2523     /*
2524      * If system is running in postcopy mode, page inserts to host memory must
2525      * be atomic
2526      */
2527     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2528     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2529     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2530
2531     seq_iter++;
2532
2533     if (version_id != 4) {
2534         ret = -EINVAL;
2535     }
2536
2537     /* This RCU critical section can be very long running.
2538      * When RCU reclaims in the code start to become numerous,
2539      * it will be necessary to reduce the granularity of this
2540      * critical section.
2541      */
2542     rcu_read_lock();
2543
2544     if (postcopy_running) {
2545         ret = ram_load_postcopy(f);
2546     }
2547
2548     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2549         ram_addr_t addr, total_ram_bytes;
2550         void *host = NULL;
2551         uint8_t ch;
2552
2553         addr = qemu_get_be64(f);
2554         flags = addr & ~TARGET_PAGE_MASK;
2555         addr &= TARGET_PAGE_MASK;
2556
2557         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2558                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2559             RAMBlock *block = ram_block_from_stream(f, flags);
2560
2561             host = host_from_ram_block_offset(block, addr);
2562             if (!host) {
2563                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2564                 ret = -EINVAL;
2565                 break;
2566             }
2567             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2568         }
2569
2570         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2571         case RAM_SAVE_FLAG_MEM_SIZE:
2572             /* Synchronize RAM block list */
2573             total_ram_bytes = addr;
2574             while (!ret && total_ram_bytes) {
2575                 RAMBlock *block;
2576                 char id[256];
2577                 ram_addr_t length;
2578
2579                 len = qemu_get_byte(f);
2580                 qemu_get_buffer(f, (uint8_t *)id, len);
2581                 id[len] = 0;
2582                 length = qemu_get_be64(f);
2583
2584                 block = qemu_ram_block_by_name(id);
2585                 if (block) {
2586                     if (length != block->used_length) {
2587                         Error *local_err = NULL;
2588
2589                         ret = qemu_ram_resize(block, length,
2590                                               &local_err);
2591                         if (local_err) {
2592                             error_report_err(local_err);
2593                         }
2594                     }
2595                     /* For postcopy we need to check hugepage sizes match */
2596                     if (postcopy_advised &&
2597                         block->page_size != qemu_host_page_size) {
2598                         uint64_t remote_page_size = qemu_get_be64(f);
2599                         if (remote_page_size != block->page_size) {
2600                             error_report("Mismatched RAM page size %s "
2601                                          "(local) %zd != %" PRId64,
2602                                          id, block->page_size,
2603                                          remote_page_size);
2604                             ret = -EINVAL;
2605                         }
2606                     }
2607                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2608                                           block->idstr);
2609                 } else {
2610                     error_report("Unknown ramblock \"%s\", cannot "
2611                                  "accept migration", id);
2612                     ret = -EINVAL;
2613                 }
2614
2615                 total_ram_bytes -= length;
2616             }
2617             break;
2618
2619         case RAM_SAVE_FLAG_ZERO:
2620             ch = qemu_get_byte(f);
2621             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2622             break;
2623
2624         case RAM_SAVE_FLAG_PAGE:
2625             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2626             break;
2627
2628         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2629             len = qemu_get_be32(f);
2630             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2631                 error_report("Invalid compressed data length: %d", len);
2632                 ret = -EINVAL;
2633                 break;
2634             }
2635             decompress_data_with_multi_threads(f, host, len);
2636             break;
2637
2638         case RAM_SAVE_FLAG_XBZRLE:
2639             if (load_xbzrle(f, addr, host) < 0) {
2640                 error_report("Failed to decompress XBZRLE page at "
2641                              RAM_ADDR_FMT, addr);
2642                 ret = -EINVAL;
2643                 break;
2644             }
2645             break;
2646         case RAM_SAVE_FLAG_EOS:
2647             /* normal exit */
2648             break;
2649         default:
2650             if (flags & RAM_SAVE_FLAG_HOOK) {
2651                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2652             } else {
2653                 error_report("Unknown combination of migration flags: %#x",
2654                              flags);
2655                 ret = -EINVAL;
2656             }
2657         }
2658         if (!ret) {
2659             ret = qemu_file_get_error(f);
2660         }
2661     }
2662
2663     wait_for_decompress_done();
2664     rcu_read_unlock();
2665     trace_ram_load_complete(ret, seq_iter);
2666     return ret;
2667 }
2668
2669 static SaveVMHandlers savevm_ram_handlers = {
2670     .save_live_setup = ram_save_setup,
2671     .save_live_iterate = ram_save_iterate,
2672     .save_live_complete_postcopy = ram_save_complete,
2673     .save_live_complete_precopy = ram_save_complete,
2674     .save_live_pending = ram_save_pending,
2675     .load_state = ram_load,
2676     .cleanup = ram_migration_cleanup,
2677 };
2678
2679 void ram_mig_init(void)
2680 {
2681     qemu_mutex_init(&XBZRLE.lock);
2682     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2683 }