migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <[email protected]>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "migration/migration.h"
  39 #include "postcopy-ram.h"
  40 #include "exec/address-spaces.h"
  41 #include "migration/page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "trace.h"
  44 #include "exec/ram_addr.h"
  45 #include "qemu/rcu_queue.h"
  46 #include "migration/colo.h"
  47
  48 /***********************************************************/
  49 /* ram save/restore */
  50
/*
 * Flags OR-ed into the low bits of the 64-bit page offset that starts every
 * record written by save_page_header().
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
/* Page sent as one fill byte; save_zero_page() uses it for all-zero pages */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
/* A raw TARGET_PAGE_SIZE page follows the header */
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Same block as the previous record: the block id string is omitted */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* An XBZRLE-encoded delta follows the header */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
/* Compressed page data (qemu_put_compression_data) follows the header */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

/* Page-sized buffer inserted into the XBZRLE cache for zero pages */
static uint8_t *ZERO_TARGET_PAGE;
  62
  63 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  64 {
  65     return buffer_is_zero(p, size);
  66 }
  67
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding; receives xbzrle_encode_buffer()
     * output before it is written to the stream */
    uint8_t *encoded_buf;
    /* buffer for storing page content; the page is copied here before
     * encoding so the encoder works on a stable snapshot */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Only taken when migrate_use_xbzrle() is true, see
     * XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
  82
  83 static void XBZRLE_cache_lock(void)
  84 {
  85     if (migrate_use_xbzrle())
  86         qemu_mutex_lock(&XBZRLE.lock);
  87 }
  88
  89 static void XBZRLE_cache_unlock(void)
  90 {
  91     if (migrate_use_xbzrle())
  92         qemu_mutex_unlock(&XBZRLE.lock);
  93 }
  94
  95 /**
  96  * xbzrle_cache_resize: resize the xbzrle cache
  97  *
  98  * This function is called from qmp_migrate_set_cache_size in main
  99  * thread, possibly while a migration is in progress.  A running
 100  * migration may be using the cache and might finish during this call,
 101  * hence changes to the cache are protected by XBZRLE.lock().
 102  *
 103  * Returns the new_size or negative in case of error.
 104  *
 105  * @new_size: new cache size
 106  */
 107 int64_t xbzrle_cache_resize(int64_t new_size)
 108 {
 109     PageCache *new_cache;
 110     int64_t ret;
 111
 112     if (new_size < TARGET_PAGE_SIZE) {
 113         return -1;
 114     }
 115
 116     XBZRLE_cache_lock();
 117
 118     if (XBZRLE.cache != NULL) {
 119         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 120             goto out_new_size;
 121         }
 122         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 123                                         TARGET_PAGE_SIZE);
 124         if (!new_cache) {
 125             error_report("Error creating cache");
 126             ret = -1;
 127             goto out;
 128         }
 129
 130         cache_fini(XBZRLE.cache);
 131         XBZRLE.cache = new_cache;
 132     }
 133
 134 out_new_size:
 135     ret = pow2floor(new_size);
 136 out:
 137     XBZRLE_cache_unlock();
 138     return ret;
 139 }
 140
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* NOTE(review): presumably the block the requested range lives in */
    RAMBlock *rb;
    /* NOTE(review): presumably start of the range, relative to rb - confirm */
    hwaddr    offset;
    /* NOTE(review): presumably length of the range in bytes - confirm */
    hwaddr    len;

    /* Linkage in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 152
 153 /* State of RAM for migration */
 154 struct RAMState {
 155     /* QEMUFile used for this migration */
 156     QEMUFile *f;
 157     /* Last block that we have visited searching for dirty pages */
 158     RAMBlock *last_seen_block;
 159     /* Last block from where we have sent data */
 160     RAMBlock *last_sent_block;
 161     /* Last dirty target page we have sent */
 162     ram_addr_t last_page;
 163     /* last ram version we have seen */
 164     uint32_t last_version;
 165     /* We are in the first round */
 166     bool ram_bulk_stage;
 167     /* How many times we have dirty too many pages */
 168     int dirty_rate_high_cnt;
 169     /* How many times we have synchronized the bitmap */
 170     uint64_t bitmap_sync_count;
 171     /* these variables are used for bitmap sync */
 172     /* last time we did a full bitmap_sync */
 173     int64_t time_last_bitmap_sync;
 174     /* bytes transferred at start_time */
 175     uint64_t bytes_xfer_prev;
 176     /* number of dirty pages since start_time */
 177     uint64_t num_dirty_pages_period;
 178     /* xbzrle misses since the beginning of the period */
 179     uint64_t xbzrle_cache_miss_prev;
 180     /* number of iterations at the beginning of period */
 181     uint64_t iterations_prev;
 182     /* Accounting fields */
 183     /* number of zero pages.  It used to be pages filled by the same char. */
 184     uint64_t zero_pages;
 185     /* number of normal transferred pages */
 186     uint64_t norm_pages;
 187     /* Iterations since start */
 188     uint64_t iterations;
 189     /* xbzrle transmitted bytes.  Notice that this is with
 190      * compression, they can't be calculated from the pages */
 191     uint64_t xbzrle_bytes;
 192     /* xbzrle transmmited pages */
 193     uint64_t xbzrle_pages;
 194     /* xbzrle number of cache miss */
 195     uint64_t xbzrle_cache_miss;
 196     /* xbzrle miss rate */
 197     double xbzrle_cache_miss_rate;
 198     /* xbzrle number of overflows */
 199     uint64_t xbzrle_overflows;
 200     /* number of dirty bits in the bitmap */
 201     uint64_t migration_dirty_pages;
 202     /* total number of bytes transferred */
 203     uint64_t bytes_transferred;
 204     /* number of dirtied pages in the last second */
 205     uint64_t dirty_pages_rate;
 206     /* Count of requests incoming from destination */
 207     uint64_t postcopy_requests;
 208     /* protects modification of the bitmap */
 209     QemuMutex bitmap_mutex;
 210     /* The RAMBlock used in the last src_page_requests */
 211     RAMBlock *last_req_rb;
 212     /* Queue of outstanding page requests from the destination */
 213     QemuMutex src_page_req_mutex;
 214     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
 215 };
 216 typedef struct RAMState RAMState;
 217
 218 static RAMState ram_state;
 219
 220 uint64_t dup_mig_pages_transferred(void)
 221 {
 222     return ram_state.zero_pages;
 223 }
 224
 225 uint64_t norm_mig_pages_transferred(void)
 226 {
 227     return ram_state.norm_pages;
 228 }
 229
 230 uint64_t xbzrle_mig_bytes_transferred(void)
 231 {
 232     return ram_state.xbzrle_bytes;
 233 }
 234
 235 uint64_t xbzrle_mig_pages_transferred(void)
 236 {
 237     return ram_state.xbzrle_pages;
 238 }
 239
 240 uint64_t xbzrle_mig_pages_cache_miss(void)
 241 {
 242     return ram_state.xbzrle_cache_miss;
 243 }
 244
 245 double xbzrle_mig_cache_miss_rate(void)
 246 {
 247     return ram_state.xbzrle_cache_miss_rate;
 248 }
 249
 250 uint64_t xbzrle_mig_pages_overflow(void)
 251 {
 252     return ram_state.xbzrle_overflows;
 253 }
 254
 255 uint64_t ram_bytes_transferred(void)
 256 {
 257     return ram_state.bytes_transferred;
 258 }
 259
 260 uint64_t ram_bytes_remaining(void)
 261 {
 262     return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
 263 }
 264
 265 uint64_t ram_dirty_sync_count(void)
 266 {
 267     return ram_state.bitmap_sync_count;
 268 }
 269
 270 uint64_t ram_dirty_pages_rate(void)
 271 {
 272     return ram_state.dirty_pages_rate;
 273 }
 274
 275 uint64_t ram_postcopy_requests(void)
 276 {
 277     return ram_state.postcopy_requests;
 278 }
 279
 280 /* used by the search for pages to send */
 281 struct PageSearchStatus {
 282     /* Current block being searched */
 283     RAMBlock    *block;
 284     /* Current page to search from */
 285     unsigned long page;
 286     /* Set once we wrap around */
 287     bool         complete_round;
 288 };
 289 typedef struct PageSearchStatus PageSearchStatus;
 290
 291 struct CompressParam {
 292     bool done;
 293     bool quit;
 294     QEMUFile *file;
 295     QemuMutex mutex;
 296     QemuCond cond;
 297     RAMBlock *block;
 298     ram_addr_t offset;
 299 };
 300 typedef struct CompressParam CompressParam;
 301
 302 struct DecompressParam {
 303     bool done;
 304     bool quit;
 305     QemuMutex mutex;
 306     QemuCond cond;
 307     void *des;
 308     uint8_t *compbuf;
 309     int len;
 310 };
 311 typedef struct DecompressParam DecompressParam;
 312
/* Arrays with one entry per compression thread, allocated in
 * migrate_compress_threads_create() and released in
 * migrate_compress_threads_join() */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression counterparts of the objects above.
 * NOTE(review): their users are outside this part of the file. */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: used by do_data_compress() before its definition */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 331
 332 static void *do_data_compress(void *opaque)
 333 {
 334     CompressParam *param = opaque;
 335     RAMBlock *block;
 336     ram_addr_t offset;
 337
 338     qemu_mutex_lock(&param->mutex);
 339     while (!param->quit) {
 340         if (param->block) {
 341             block = param->block;
 342             offset = param->offset;
 343             param->block = NULL;
 344             qemu_mutex_unlock(&param->mutex);
 345
 346             do_compress_ram_page(param->file, block, offset);
 347
 348             qemu_mutex_lock(&comp_done_lock);
 349             param->done = true;
 350             qemu_cond_signal(&comp_done_cond);
 351             qemu_mutex_unlock(&comp_done_lock);
 352
 353             qemu_mutex_lock(&param->mutex);
 354         } else {
 355             qemu_cond_wait(&param->cond, &param->mutex);
 356         }
 357     }
 358     qemu_mutex_unlock(&param->mutex);
 359
 360     return NULL;
 361 }
 362
 363 static inline void terminate_compression_threads(void)
 364 {
 365     int idx, thread_count;
 366
 367     thread_count = migrate_compress_threads();
 368
 369     for (idx = 0; idx < thread_count; idx++) {
 370         qemu_mutex_lock(&comp_param[idx].mutex);
 371         comp_param[idx].quit = true;
 372         qemu_cond_signal(&comp_param[idx].cond);
 373         qemu_mutex_unlock(&comp_param[idx].mutex);
 374     }
 375 }
 376
 377 void migrate_compress_threads_join(void)
 378 {
 379     int i, thread_count;
 380
 381     if (!migrate_use_compression()) {
 382         return;
 383     }
 384     terminate_compression_threads();
 385     thread_count = migrate_compress_threads();
 386     for (i = 0; i < thread_count; i++) {
 387         qemu_thread_join(compress_threads + i);
 388         qemu_fclose(comp_param[i].file);
 389         qemu_mutex_destroy(&comp_param[i].mutex);
 390         qemu_cond_destroy(&comp_param[i].cond);
 391     }
 392     qemu_mutex_destroy(&comp_done_lock);
 393     qemu_cond_destroy(&comp_done_cond);
 394     g_free(compress_threads);
 395     g_free(comp_param);
 396     compress_threads = NULL;
 397     comp_param = NULL;
 398 }
 399
 400 void migrate_compress_threads_create(void)
 401 {
 402     int i, thread_count;
 403
 404     if (!migrate_use_compression()) {
 405         return;
 406     }
 407     thread_count = migrate_compress_threads();
 408     compress_threads = g_new0(QemuThread, thread_count);
 409     comp_param = g_new0(CompressParam, thread_count);
 410     qemu_cond_init(&comp_done_cond);
 411     qemu_mutex_init(&comp_done_lock);
 412     for (i = 0; i < thread_count; i++) {
 413         /* comp_param[i].file is just used as a dummy buffer to save data,
 414          * set its ops to empty.
 415          */
 416         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 417         comp_param[i].done = true;
 418         comp_param[i].quit = false;
 419         qemu_mutex_init(&comp_param[i].mutex);
 420         qemu_cond_init(&comp_param[i].cond);
 421         qemu_thread_create(compress_threads + i, "compress",
 422                            do_data_compress, comp_param + i,
 423                            QEMU_THREAD_JOINABLE);
 424     }
 425 }
 426
 427 /**
 428  * save_page_header: write page header to wire
 429  *
 430  * If this is the 1st block, it also writes the block identification
 431  *
 432  * Returns the number of bytes written
 433  *
 434  * @f: QEMUFile where to send the data
 435  * @block: block that contains the page we want to send
 436  * @offset: offset inside the block for the page
 437  *          in the lower bits, it contains flags
 438  */
 439 static size_t save_page_header(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 440 {
 441     size_t size, len;
 442
 443     if (block == rs->last_sent_block) {
 444         offset |= RAM_SAVE_FLAG_CONTINUE;
 445     }
 446     qemu_put_be64(rs->f, offset);
 447     size = 8;
 448
 449     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 450         len = strlen(block->idstr);
 451         qemu_put_byte(rs->f, len);
 452         qemu_put_buffer(rs->f, (uint8_t *)block->idstr, len);
 453         size += 1 + len;
 454         rs->last_sent_block = block;
 455     }
 456     return size;
 457 }
 458
 459 /**
 460  * mig_throttle_guest_down: throotle down the guest
 461  *
 462  * Reduce amount of guest cpu execution to hopefully slow down memory
 463  * writes. If guest dirty memory rate is reduced below the rate at
 464  * which we can transfer pages to the destination then we should be
 465  * able to complete migration. Some workloads dirty memory way too
 466  * fast and will not effectively converge, even with auto-converge.
 467  */
 468 static void mig_throttle_guest_down(void)
 469 {
 470     MigrationState *s = migrate_get_current();
 471     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 472     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 473
 474     /* We have not started throttling yet. Let's start it. */
 475     if (!cpu_throttle_active()) {
 476         cpu_throttle_set(pct_initial);
 477     } else {
 478         /* Throttling already on, just increase the rate */
 479         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 480     }
 481 }
 482
 483 /**
 484  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 485  *
 486  * @rs: current RAM state
 487  * @current_addr: address for the zero page
 488  *
 489  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 490  * The important thing is that a stale (not-yet-0'd) page be replaced
 491  * by the new data.
 492  * As a bonus, if the page wasn't in the cache it gets added so that
 493  * when a small write is made into the 0'd page it gets XBZRLE sent.
 494  */
 495 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 496 {
 497     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 498         return;
 499     }
 500
 501     /* We don't care if this fails to allocate a new cache page
 502      * as long as it updated an old one */
 503     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 504                  rs->bitmap_sync_count);
 505 }
 506
 507 #define ENCODING_FLAG_XBZRLE 0x1
 508
 509 /**
 510  * save_xbzrle_page: compress and send current page
 511  *
 512  * Returns: 1 means that we wrote the page
 513  *          0 means that page is identical to the one already sent
 514  *          -1 means that xbzrle would be longer than normal
 515  *
 516  * @rs: current RAM state
 517  * @current_data: pointer to the address of the page contents
 518  * @current_addr: addr of the page
 519  * @block: block that contains the page we want to send
 520  * @offset: offset inside the block for the page
 521  * @last_stage: if we are at the completion stage
 522  */
 523 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 524                             ram_addr_t current_addr, RAMBlock *block,
 525                             ram_addr_t offset, bool last_stage)
 526 {
 527     int encoded_len = 0, bytes_xbzrle;
 528     uint8_t *prev_cached_page;
 529
 530     if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
 531         rs->xbzrle_cache_miss++;
 532         if (!last_stage) {
 533             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 534                              rs->bitmap_sync_count) == -1) {
 535                 return -1;
 536             } else {
 537                 /* update *current_data when the page has been
 538                    inserted into cache */
 539                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 540             }
 541         }
 542         return -1;
 543     }
 544
 545     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 546
 547     /* save current buffer into memory */
 548     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 549
 550     /* XBZRLE encoding (if there is no overflow) */
 551     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 552                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 553                                        TARGET_PAGE_SIZE);
 554     if (encoded_len == 0) {
 555         trace_save_xbzrle_page_skipping();
 556         return 0;
 557     } else if (encoded_len == -1) {
 558         trace_save_xbzrle_page_overflow();
 559         rs->xbzrle_overflows++;
 560         /* update data in the cache */
 561         if (!last_stage) {
 562             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 563             *current_data = prev_cached_page;
 564         }
 565         return -1;
 566     }
 567
 568     /* we need to update the data in the cache, in order to get the same data */
 569     if (!last_stage) {
 570         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 571     }
 572
 573     /* Send XBZRLE based compressed page */
 574     bytes_xbzrle = save_page_header(rs, block,
 575                                     offset | RAM_SAVE_FLAG_XBZRLE);
 576     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 577     qemu_put_be16(rs->f, encoded_len);
 578     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 579     bytes_xbzrle += encoded_len + 1 + 2;
 580     rs->xbzrle_pages++;
 581     rs->xbzrle_bytes += bytes_xbzrle;
 582     rs->bytes_transferred += bytes_xbzrle;
 583
 584     return 1;
 585 }
 586
 587 /**
 588  * migration_bitmap_find_dirty: find the next dirty page from start
 589  *
 590  * Called with rcu_read_lock() to protect migration_bitmap
 591  *
 592  * Returns the byte offset within memory region of the start of a dirty page
 593  *
 594  * @rs: current RAM state
 595  * @rb: RAMBlock where to search for dirty pages
 596  * @start: page where we start the search
 597  */
 598 static inline
 599 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 600                                           unsigned long start)
 601 {
 602     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 603     unsigned long *bitmap = rb->bmap;
 604     unsigned long next;
 605
 606     if (rs->ram_bulk_stage && start > 0) {
 607         next = start + 1;
 608     } else {
 609         next = find_next_bit(bitmap, size, start);
 610     }
 611
 612     return next;
 613 }
 614
 615 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 616                                                 RAMBlock *rb,
 617                                                 unsigned long page)
 618 {
 619     bool ret;
 620
 621     ret = test_and_clear_bit(page, rb->bmap);
 622
 623     if (ret) {
 624         rs->migration_dirty_pages--;
 625     }
 626     return ret;
 627 }
 628
 629 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 630                                         ram_addr_t start, ram_addr_t length)
 631 {
 632     rs->migration_dirty_pages +=
 633         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 634                                               &rs->num_dirty_pages_period);
 635 }
 636
 637 /**
 638  * ram_pagesize_summary: calculate all the pagesizes of a VM
 639  *
 640  * Returns a summary bitmap of the page sizes of all RAMBlocks
 641  *
 642  * For VMs with just normal pages this is equivalent to the host page
 643  * size. If it's got some huge pages then it's the OR of all the
 644  * different page sizes.
 645  */
 646 uint64_t ram_pagesize_summary(void)
 647 {
 648     RAMBlock *block;
 649     uint64_t summary = 0;
 650
 651     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 652         summary |= block->page_size;
 653     }
 654
 655     return summary;
 656 }
 657
 658 static void migration_bitmap_sync(RAMState *rs)
 659 {
 660     RAMBlock *block;
 661     int64_t end_time;
 662     uint64_t bytes_xfer_now;
 663
 664     rs->bitmap_sync_count++;
 665
 666     if (!rs->bytes_xfer_prev) {
 667         rs->bytes_xfer_prev = ram_bytes_transferred();
 668     }
 669
 670     if (!rs->time_last_bitmap_sync) {
 671         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 672     }
 673
 674     trace_migration_bitmap_sync_start();
 675     memory_global_dirty_log_sync();
 676
 677     qemu_mutex_lock(&rs->bitmap_mutex);
 678     rcu_read_lock();
 679     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 680         migration_bitmap_sync_range(rs, block, 0, block->used_length);
 681     }
 682     rcu_read_unlock();
 683     qemu_mutex_unlock(&rs->bitmap_mutex);
 684
 685     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 686
 687     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 688
 689     /* more than 1 second = 1000 millisecons */
 690     if (end_time > rs->time_last_bitmap_sync + 1000) {
 691         if (migrate_auto_converge()) {
 692             /* The following detection logic can be refined later. For now:
 693                Check to see if the dirtied bytes is 50% more than the approx.
 694                amount of bytes that just got transferred since the last time we
 695                were in this routine. If that happens twice, start or increase
 696                throttling */
 697             bytes_xfer_now = ram_bytes_transferred();
 698
 699             if (rs->dirty_pages_rate &&
 700                (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 701                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 702                (rs->dirty_rate_high_cnt++ >= 2)) {
 703                     trace_migration_throttle();
 704                     rs->dirty_rate_high_cnt = 0;
 705                     mig_throttle_guest_down();
 706              }
 707              rs->bytes_xfer_prev = bytes_xfer_now;
 708         }
 709
 710         if (migrate_use_xbzrle()) {
 711             if (rs->iterations_prev != rs->iterations) {
 712                 rs->xbzrle_cache_miss_rate =
 713                    (double)(rs->xbzrle_cache_miss -
 714                             rs->xbzrle_cache_miss_prev) /
 715                    (rs->iterations - rs->iterations_prev);
 716             }
 717             rs->iterations_prev = rs->iterations;
 718             rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
 719         }
 720         rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
 721             / (end_time - rs->time_last_bitmap_sync);
 722         rs->time_last_bitmap_sync = end_time;
 723         rs->num_dirty_pages_period = 0;
 724     }
 725     if (migrate_use_events()) {
 726         qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
 727     }
 728 }
 729
 730 /**
 731  * save_zero_page: send the zero page to the stream
 732  *
 733  * Returns the number of pages written.
 734  *
 735  * @rs: current RAM state
 736  * @block: block that contains the page we want to send
 737  * @offset: offset inside the block for the page
 738  * @p: pointer to the page
 739  */
 740 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 741                           uint8_t *p)
 742 {
 743     int pages = -1;
 744
 745     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 746         rs->zero_pages++;
 747         rs->bytes_transferred +=
 748             save_page_header(rs, block, offset | RAM_SAVE_FLAG_COMPRESS);
 749         qemu_put_byte(rs->f, 0);
 750         rs->bytes_transferred += 1;
 751         pages = 1;
 752     }
 753
 754     return pages;
 755 }
 756
 757 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 758 {
 759     if (!migrate_release_ram() || !migration_in_postcopy()) {
 760         return;
 761     }
 762
 763     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 764 }
 765
 766 /**
 767  * ram_save_page: send the given page to the stream
 768  *
 769  * Returns the number of pages written.
 770  *          < 0 - error
 771  *          >=0 - Number of pages written - this might legally be 0
 772  *                if xbzrle noticed the page was the same.
 773  *
 774  * @rs: current RAM state
 775  * @block: block that contains the page we want to send
 776  * @offset: offset inside the block for the page
 777  * @last_stage: if we are at the completion stage
 778  */
 779 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 780 {
 781     int pages = -1;
 782     uint64_t bytes_xmit;
 783     ram_addr_t current_addr;
 784     uint8_t *p;
 785     int ret;
 786     bool send_async = true;
 787     RAMBlock *block = pss->block;
 788     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 789
 790     p = block->host + offset;
 791
 792     /* In doubt sent page as normal */
 793     bytes_xmit = 0;
 794     ret = ram_control_save_page(rs->f, block->offset,
 795                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 796     if (bytes_xmit) {
 797         rs->bytes_transferred += bytes_xmit;
 798         pages = 1;
 799     }
 800
 801     XBZRLE_cache_lock();
 802
 803     current_addr = block->offset + offset;
 804
 805     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 806         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 807             if (bytes_xmit > 0) {
 808                 rs->norm_pages++;
 809             } else if (bytes_xmit == 0) {
 810                 rs->zero_pages++;
 811             }
 812         }
 813     } else {
 814         pages = save_zero_page(rs, block, offset, p);
 815         if (pages > 0) {
 816             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 817              * page would be stale
 818              */
 819             xbzrle_cache_zero_page(rs, current_addr);
 820             ram_release_pages(block->idstr, offset, pages);
 821         } else if (!rs->ram_bulk_stage &&
 822                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 823             pages = save_xbzrle_page(rs, &p, current_addr, block,
 824                                      offset, last_stage);
 825             if (!last_stage) {
 826                 /* Can't send this cached data async, since the cache page
 827                  * might get updated before it gets to the wire
 828                  */
 829                 send_async = false;
 830             }
 831         }
 832     }
 833
 834     /* XBZRLE overflow or normal page */
 835     if (pages == -1) {
 836         rs->bytes_transferred += save_page_header(rs, block,
 837                                                   offset | RAM_SAVE_FLAG_PAGE);
 838         if (send_async) {
 839             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
 840                                   migrate_release_ram() &
 841                                   migration_in_postcopy());
 842         } else {
 843             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
 844         }
 845         rs->bytes_transferred += TARGET_PAGE_SIZE;
 846         pages = 1;
 847         rs->norm_pages++;
 848     }
 849
 850     XBZRLE_cache_unlock();
 851
 852     return pages;
 853 }
 854
 855 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 856                                 ram_addr_t offset)
 857 {
 858     RAMState *rs = &ram_state;
 859     int bytes_sent, blen;
 860     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 861
 862     bytes_sent = save_page_header(rs, block, offset |
 863                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 864     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 865                                      migrate_compress_level());
 866     if (blen < 0) {
 867         bytes_sent = 0;
 868         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 869         error_report("compressed data failed!");
 870     } else {
 871         bytes_sent += blen;
 872         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
 873     }
 874
 875     return bytes_sent;
 876 }
 877
 878 static void flush_compressed_data(RAMState *rs)
 879 {
 880     int idx, len, thread_count;
 881
 882     if (!migrate_use_compression()) {
 883         return;
 884     }
 885     thread_count = migrate_compress_threads();
 886
 887     qemu_mutex_lock(&comp_done_lock);
 888     for (idx = 0; idx < thread_count; idx++) {
 889         while (!comp_param[idx].done) {
 890             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 891         }
 892     }
 893     qemu_mutex_unlock(&comp_done_lock);
 894
 895     for (idx = 0; idx < thread_count; idx++) {
 896         qemu_mutex_lock(&comp_param[idx].mutex);
 897         if (!comp_param[idx].quit) {
 898             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 899             rs->bytes_transferred += len;
 900         }
 901         qemu_mutex_unlock(&comp_param[idx].mutex);
 902     }
 903 }
 904
 905 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 906                                        ram_addr_t offset)
 907 {
 908     param->block = block;
 909     param->offset = offset;
 910 }
 911
 912 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
 913                                            ram_addr_t offset)
 914 {
 915     int idx, thread_count, bytes_xmit = -1, pages = -1;
 916
 917     thread_count = migrate_compress_threads();
 918     qemu_mutex_lock(&comp_done_lock);
 919     while (true) {
 920         for (idx = 0; idx < thread_count; idx++) {
 921             if (comp_param[idx].done) {
 922                 comp_param[idx].done = false;
 923                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 924                 qemu_mutex_lock(&comp_param[idx].mutex);
 925                 set_compress_params(&comp_param[idx], block, offset);
 926                 qemu_cond_signal(&comp_param[idx].cond);
 927                 qemu_mutex_unlock(&comp_param[idx].mutex);
 928                 pages = 1;
 929                 rs->norm_pages++;
 930                 rs->bytes_transferred += bytes_xmit;
 931                 break;
 932             }
 933         }
 934         if (pages > 0) {
 935             break;
 936         } else {
 937             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 938         }
 939     }
 940     qemu_mutex_unlock(&comp_done_lock);
 941
 942     return pages;
 943 }
 944
 945 /**
 946  * ram_save_compressed_page: compress the given page and send it to the stream
 947  *
 948  * Returns the number of pages written.
 949  *
 950  * @rs: current RAM state
 951  * @block: block that contains the page we want to send
 952  * @offset: offset inside the block for the page
 953  * @last_stage: if we are at the completion stage
 954  */
 955 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
 956                                     bool last_stage)
 957 {
 958     int pages = -1;
 959     uint64_t bytes_xmit = 0;
 960     uint8_t *p;
 961     int ret, blen;
 962     RAMBlock *block = pss->block;
 963     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 964
 965     p = block->host + offset;
 966
 967     ret = ram_control_save_page(rs->f, block->offset,
 968                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 969     if (bytes_xmit) {
 970         rs->bytes_transferred += bytes_xmit;
 971         pages = 1;
 972     }
 973     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 974         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 975             if (bytes_xmit > 0) {
 976                 rs->norm_pages++;
 977             } else if (bytes_xmit == 0) {
 978                 rs->zero_pages++;
 979             }
 980         }
 981     } else {
 982         /* When starting the process of a new block, the first page of
 983          * the block should be sent out before other pages in the same
 984          * block, and all the pages in last block should have been sent
 985          * out, keeping this order is important, because the 'cont' flag
 986          * is used to avoid resending the block name.
 987          */
 988         if (block != rs->last_sent_block) {
 989             flush_compressed_data(rs);
 990             pages = save_zero_page(rs, block, offset, p);
 991             if (pages == -1) {
 992                 /* Make sure the first page is sent out before other pages */
 993                 bytes_xmit = save_page_header(rs, block, offset |
 994                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
 995                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
 996                                                  migrate_compress_level());
 997                 if (blen > 0) {
 998                     rs->bytes_transferred += bytes_xmit + blen;
 999                     rs->norm_pages++;
1000                     pages = 1;
1001                 } else {
1002                     qemu_file_set_error(rs->f, blen);
1003                     error_report("compressed data failed!");
1004                 }
1005             }
1006             if (pages > 0) {
1007                 ram_release_pages(block->idstr, offset, pages);
1008             }
1009         } else {
1010             pages = save_zero_page(rs, block, offset, p);
1011             if (pages == -1) {
1012                 pages = compress_page_with_multi_thread(rs, block, offset);
1013             } else {
1014                 ram_release_pages(block->idstr, offset, pages);
1015             }
1016         }
1017     }
1018
1019     return pages;
1020 }
1021
1022 /**
1023  * find_dirty_block: find the next dirty page and update any state
1024  * associated with the search process.
1025  *
1026  * Returns if a page is found
1027  *
1028  * @rs: current RAM state
1029  * @pss: data about the state of the current dirty page scan
1030  * @again: set to false if the search has scanned the whole of RAM
1031  */
1032 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1033 {
1034     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1035     if (pss->complete_round && pss->block == rs->last_seen_block &&
1036         pss->page >= rs->last_page) {
1037         /*
1038          * We've been once around the RAM and haven't found anything.
1039          * Give up.
1040          */
1041         *again = false;
1042         return false;
1043     }
1044     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1045         /* Didn't find anything in this RAM Block */
1046         pss->page = 0;
1047         pss->block = QLIST_NEXT_RCU(pss->block, next);
1048         if (!pss->block) {
1049             /* Hit the end of the list */
1050             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1051             /* Flag that we've looped */
1052             pss->complete_round = true;
1053             rs->ram_bulk_stage = false;
1054             if (migrate_use_xbzrle()) {
1055                 /* If xbzrle is on, stop using the data compression at this
1056                  * point. In theory, xbzrle can do better than compression.
1057                  */
1058                 flush_compressed_data(rs);
1059             }
1060         }
1061         /* Didn't find anything this time, but try again on the new block */
1062         *again = true;
1063         return false;
1064     } else {
1065         /* Can go around again, but... */
1066         *again = true;
1067         /* We've found something so probably don't need to */
1068         return true;
1069     }
1070 }
1071
1072 /**
1073  * unqueue_page: gets a page of the queue
1074  *
1075  * Helper for 'get_queued_page' - gets a page off the queue
1076  *
1077  * Returns the block of the page (or NULL if none available)
1078  *
1079  * @rs: current RAM state
1080  * @offset: used to return the offset within the RAMBlock
1081  */
1082 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1083 {
1084     RAMBlock *block = NULL;
1085
1086     qemu_mutex_lock(&rs->src_page_req_mutex);
1087     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1088         struct RAMSrcPageRequest *entry =
1089                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1090         block = entry->rb;
1091         *offset = entry->offset;
1092
1093         if (entry->len > TARGET_PAGE_SIZE) {
1094             entry->len -= TARGET_PAGE_SIZE;
1095             entry->offset += TARGET_PAGE_SIZE;
1096         } else {
1097             memory_region_unref(block->mr);
1098             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1099             g_free(entry);
1100         }
1101     }
1102     qemu_mutex_unlock(&rs->src_page_req_mutex);
1103
1104     return block;
1105 }
1106
1107 /**
1108  * get_queued_page: unqueue a page from the postocpy requests
1109  *
1110  * Skips pages that are already sent (!dirty)
1111  *
1112  * Returns if a queued page is found
1113  *
1114  * @rs: current RAM state
1115  * @pss: data about the state of the current dirty page scan
1116  */
1117 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1118 {
1119     RAMBlock  *block;
1120     ram_addr_t offset;
1121     bool dirty;
1122
1123     do {
1124         block = unqueue_page(rs, &offset);
1125         /*
1126          * We're sending this page, and since it's postcopy nothing else
1127          * will dirty it, and we must make sure it doesn't get sent again
1128          * even if this queue request was received after the background
1129          * search already sent it.
1130          */
1131         if (block) {
1132             unsigned long page;
1133
1134             page = offset >> TARGET_PAGE_BITS;
1135             dirty = test_bit(page, block->bmap);
1136             if (!dirty) {
1137                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1138                        page, test_bit(page, block->unsentmap));
1139             } else {
1140                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1141             }
1142         }
1143
1144     } while (block && !dirty);
1145
1146     if (block) {
1147         /*
1148          * As soon as we start servicing pages out of order, then we have
1149          * to kill the bulk stage, since the bulk stage assumes
1150          * in (migration_bitmap_find_and_reset_dirty) that every page is
1151          * dirty, that's no longer true.
1152          */
1153         rs->ram_bulk_stage = false;
1154
1155         /*
1156          * We want the background search to continue from the queued page
1157          * since the guest is likely to want other pages near to the page
1158          * it just requested.
1159          */
1160         pss->block = block;
1161         pss->page = offset >> TARGET_PAGE_BITS;
1162     }
1163
1164     return !!block;
1165 }
1166
1167 /**
1168  * migration_page_queue_free: drop any remaining pages in the ram
1169  * request queue
1170  *
1171  * It should be empty at the end anyway, but in error cases there may
1172  * be some left.  in case that there is any page left, we drop it.
1173  *
1174  */
1175 void migration_page_queue_free(void)
1176 {
1177     struct RAMSrcPageRequest *mspr, *next_mspr;
1178     RAMState *rs = &ram_state;
1179     /* This queue generally should be empty - but in the case of a failed
1180      * migration might have some droppings in.
1181      */
1182     rcu_read_lock();
1183     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1184         memory_region_unref(mspr->rb->mr);
1185         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1186         g_free(mspr);
1187     }
1188     rcu_read_unlock();
1189 }
1190
1191 /**
1192  * ram_save_queue_pages: queue the page for transmission
1193  *
1194  * A request from postcopy destination for example.
1195  *
1196  * Returns zero on success or negative on error
1197  *
1198  * @rbname: Name of the RAMBLock of the request. NULL means the
1199  *          same that last one.
1200  * @start: starting address from the start of the RAMBlock
1201  * @len: length (in bytes) to send
1202  */
1203 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1204 {
1205     RAMBlock *ramblock;
1206     RAMState *rs = &ram_state;
1207
1208     rs->postcopy_requests++;
1209     rcu_read_lock();
1210     if (!rbname) {
1211         /* Reuse last RAMBlock */
1212         ramblock = rs->last_req_rb;
1213
1214         if (!ramblock) {
1215             /*
1216              * Shouldn't happen, we can't reuse the last RAMBlock if
1217              * it's the 1st request.
1218              */
1219             error_report("ram_save_queue_pages no previous block");
1220             goto err;
1221         }
1222     } else {
1223         ramblock = qemu_ram_block_by_name(rbname);
1224
1225         if (!ramblock) {
1226             /* We shouldn't be asked for a non-existent RAMBlock */
1227             error_report("ram_save_queue_pages no block '%s'", rbname);
1228             goto err;
1229         }
1230         rs->last_req_rb = ramblock;
1231     }
1232     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1233     if (start+len > ramblock->used_length) {
1234         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1235                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1236                      __func__, start, len, ramblock->used_length);
1237         goto err;
1238     }
1239
1240     struct RAMSrcPageRequest *new_entry =
1241         g_malloc0(sizeof(struct RAMSrcPageRequest));
1242     new_entry->rb = ramblock;
1243     new_entry->offset = start;
1244     new_entry->len = len;
1245
1246     memory_region_ref(ramblock->mr);
1247     qemu_mutex_lock(&rs->src_page_req_mutex);
1248     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1249     qemu_mutex_unlock(&rs->src_page_req_mutex);
1250     rcu_read_unlock();
1251
1252     return 0;
1253
1254 err:
1255     rcu_read_unlock();
1256     return -1;
1257 }
1258
1259 /**
1260  * ram_save_target_page: save one target page
1261  *
1262  * Returns the number of pages written
1263  *
1264  * @rs: current RAM state
1265  * @ms: current migration state
1266  * @pss: data about the page we want to send
1267  * @last_stage: if we are at the completion stage
1268  */
1269 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1270                                 bool last_stage)
1271 {
1272     int res = 0;
1273
1274     /* Check the pages is dirty and if it is send it */
1275     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1276         /*
1277          * If xbzrle is on, stop using the data compression after first
1278          * round of migration even if compression is enabled. In theory,
1279          * xbzrle can do better than compression.
1280          */
1281         if (migrate_use_compression() &&
1282             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1283             res = ram_save_compressed_page(rs, pss, last_stage);
1284         } else {
1285             res = ram_save_page(rs, pss, last_stage);
1286         }
1287
1288         if (res < 0) {
1289             return res;
1290         }
1291         if (pss->block->unsentmap) {
1292             clear_bit(pss->page, pss->block->unsentmap);
1293         }
1294     }
1295
1296     return res;
1297 }
1298
1299 /**
1300  * ram_save_host_page: save a whole host page
1301  *
1302  * Starting at *offset send pages up to the end of the current host
1303  * page. It's valid for the initial offset to point into the middle of
1304  * a host page in which case the remainder of the hostpage is sent.
1305  * Only dirty target pages are sent. Note that the host page size may
1306  * be a huge page for this block.
1307  *
1308  * Returns the number of pages written or negative on error
1309  *
1310  * @rs: current RAM state
1311  * @ms: current migration state
1312  * @pss: data about the page we want to send
1313  * @last_stage: if we are at the completion stage
1314  */
1315 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1316                               bool last_stage)
1317 {
1318     int tmppages, pages = 0;
1319     size_t pagesize_bits =
1320         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1321
1322     do {
1323         tmppages = ram_save_target_page(rs, pss, last_stage);
1324         if (tmppages < 0) {
1325             return tmppages;
1326         }
1327
1328         pages += tmppages;
1329         pss->page++;
1330     } while (pss->page & (pagesize_bits - 1));
1331
1332     /* The offset we leave with is the last one we looked at */
1333     pss->page--;
1334     return pages;
1335 }
1336
1337 /**
1338  * ram_find_and_save_block: finds a dirty page and sends it to f
1339  *
1340  * Called within an RCU critical section.
1341  *
1342  * Returns the number of pages written where zero means no dirty pages
1343  *
1344  * @rs: current RAM state
1345  * @last_stage: if we are at the completion stage
1346  *
1347  * On systems where host-page-size > target-page-size it will send all the
1348  * pages in a host page that are dirty.
1349  */
1350
1351 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1352 {
1353     PageSearchStatus pss;
1354     int pages = 0;
1355     bool again, found;
1356
1357     /* No dirty page as there is zero RAM */
1358     if (!ram_bytes_total()) {
1359         return pages;
1360     }
1361
1362     pss.block = rs->last_seen_block;
1363     pss.page = rs->last_page;
1364     pss.complete_round = false;
1365
1366     if (!pss.block) {
1367         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1368     }
1369
1370     do {
1371         again = true;
1372         found = get_queued_page(rs, &pss);
1373
1374         if (!found) {
1375             /* priority queue empty, so just search for something dirty */
1376             found = find_dirty_block(rs, &pss, &again);
1377         }
1378
1379         if (found) {
1380             pages = ram_save_host_page(rs, &pss, last_stage);
1381         }
1382     } while (!pages && again);
1383
1384     rs->last_seen_block = pss.block;
1385     rs->last_page = pss.page;
1386
1387     return pages;
1388 }
1389
1390 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1391 {
1392     uint64_t pages = size / TARGET_PAGE_SIZE;
1393     RAMState *rs = &ram_state;
1394
1395     if (zero) {
1396         rs->zero_pages += pages;
1397     } else {
1398         rs->norm_pages += pages;
1399         rs->bytes_transferred += size;
1400         qemu_update_position(f, size);
1401     }
1402 }
1403
1404 uint64_t ram_bytes_total(void)
1405 {
1406     RAMBlock *block;
1407     uint64_t total = 0;
1408
1409     rcu_read_lock();
1410     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1411         total += block->used_length;
1412     rcu_read_unlock();
1413     return total;
1414 }
1415
/* Release xbzrle_decoded_buf and clear the pointer. */
void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}
1421
1422 static void ram_migration_cleanup(void *opaque)
1423 {
1424     RAMBlock *block;
1425
1426     /* caller have hold iothread lock or is in a bh, so there is
1427      * no writing race against this migration_bitmap
1428      */
1429     memory_global_dirty_log_stop();
1430
1431     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1432         g_free(block->bmap);
1433         block->bmap = NULL;
1434         g_free(block->unsentmap);
1435         block->unsentmap = NULL;
1436     }
1437
1438     XBZRLE_cache_lock();
1439     if (XBZRLE.cache) {
1440         cache_fini(XBZRLE.cache);
1441         g_free(XBZRLE.encoded_buf);
1442         g_free(XBZRLE.current_buf);
1443         g_free(ZERO_TARGET_PAGE);
1444         XBZRLE.cache = NULL;
1445         XBZRLE.encoded_buf = NULL;
1446         XBZRLE.current_buf = NULL;
1447     }
1448     XBZRLE_cache_unlock();
1449 }
1450
1451 static void ram_state_reset(RAMState *rs)
1452 {
1453     rs->last_seen_block = NULL;
1454     rs->last_sent_block = NULL;
1455     rs->last_page = 0;
1456     rs->last_version = ram_list.version;
1457     rs->ram_bulk_stage = true;
1458 }
1459
#define MAX_WAIT 50 /* in ms; half of the buffered_file limit */
1461
/*
 * Debug helper: print the first 'pages' bits of 'todump' to stderr,
 * 128 bits per line, as '1' (bit set) or '.' (bit clear), each line
 * prefixed with the index of its first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * A NULL 'todump' is ignored (there is no bitmap to fall back to).
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to dump; test_bit() on NULL would crash */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so the terminator fits */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
1495
1496 /* **** functions for postcopy ***** */
1497
1498 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1499 {
1500     struct RAMBlock *block;
1501
1502     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1503         unsigned long *bitmap = block->bmap;
1504         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1505         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1506
1507         while (run_start < range) {
1508             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1509             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1510                               (run_end - run_start) << TARGET_PAGE_BITS);
1511             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1512         }
1513     }
1514 }
1515
1516 /**
1517  * postcopy_send_discard_bm_ram: discard a RAMBlock
1518  *
1519  * Returns zero on success
1520  *
1521  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1522  * Note: At this point the 'unsentmap' is the processed bitmap combined
1523  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1524  *
1525  * @ms: current migration state
1526  * @pds: state for postcopy
1527  * @start: RAMBlock starting page
1528  * @length: RAMBlock size
1529  */
1530 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1531                                         PostcopyDiscardState *pds,
1532                                         RAMBlock *block)
1533 {
1534     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1535     unsigned long current;
1536     unsigned long *unsentmap = block->unsentmap;
1537
1538     for (current = 0; current < end; ) {
1539         unsigned long one = find_next_bit(unsentmap, end, current);
1540
1541         if (one <= end) {
1542             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1543             unsigned long discard_length;
1544
1545             if (zero >= end) {
1546                 discard_length = end - one;
1547             } else {
1548                 discard_length = zero - one;
1549             }
1550             if (discard_length) {
1551                 postcopy_discard_send_range(ms, pds, one, discard_length);
1552             }
1553             current = one + discard_length;
1554         } else {
1555             current = one;
1556         }
1557     }
1558
1559     return 0;
1560 }
1561
1562 /**
1563  * postcopy_each_ram_send_discard: discard all RAMBlocks
1564  *
1565  * Returns 0 for success or negative for error
1566  *
1567  * Utility for the outgoing postcopy code.
1568  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1569  *   passing it bitmap indexes and name.
1570  * (qemu_ram_foreach_block ends up passing unscaled lengths
1571  *  which would mean postcopy code would have to deal with target page)
1572  *
1573  * @ms: current migration state
1574  */
1575 static int postcopy_each_ram_send_discard(MigrationState *ms)
1576 {
1577     struct RAMBlock *block;
1578     int ret;
1579
1580     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1581         PostcopyDiscardState *pds =
1582             postcopy_discard_send_init(ms, block->idstr);
1583
1584         /*
1585          * Postcopy sends chunks of bitmap over the wire, but it
1586          * just needs indexes at this point, avoids it having
1587          * target page specific code.
1588          */
1589         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1590         postcopy_discard_send_finish(ms, pds);
1591         if (ret) {
1592             return ret;
1593         }
1594     }
1595
1596     return 0;
1597 }
1598
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * The pass walks runs of sent pages (unsent_pass) or dirty pages
 * (!unsent_pass).  Whenever a run starts or ends in the middle of a host
 * page, every target page of that host page is marked unsent and dirty,
 * and the destination may be told to discard the host page.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = &ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    /* Size of the block in target pages */
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        /* First target page of the host page to fix; valid if do_fixup */
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
1725
1726 /**
1727  * postcopy_chuck_hostpages: discrad any partially sent host page
1728  *
1729  * Utility for the outgoing postcopy code.
1730  *
1731  * Discard any partially sent host-page size chunks, mark any partially
1732  * dirty host-page size chunks as all dirty.  In this case the host-page
1733  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1734  *
1735  * Returns zero on success
1736  *
1737  * @ms: current migration state
1738  * @block: block we want to work with
1739  */
1740 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1741 {
1742     PostcopyDiscardState *pds =
1743         postcopy_discard_send_init(ms, block->idstr);
1744
1745     /* First pass: Discard all partially sent host pages */
1746     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1747     /*
1748      * Second pass: Ensure that all partially dirty host pages are made
1749      * fully dirty.
1750      */
1751     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1752
1753     postcopy_discard_send_finish(ms, pds);
1754     return 0;
1755 }
1756
1757 /**
1758  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1759  *
1760  * Returns zero on success
1761  *
1762  * Transmit the set of pages to be discarded after precopy to the target
1763  * these are pages that:
1764  *     a) Have been previously transmitted but are now dirty again
1765  *     b) Pages that have never been transmitted, this ensures that
1766  *        any pages on the destination that have been mapped by background
1767  *        tasks get discarded (transparent huge pages is the specific concern)
1768  * Hopefully this is pretty sparse
1769  *
1770  * @ms: current migration state
1771  */
1772 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1773 {
1774     RAMState *rs = &ram_state;
1775     RAMBlock *block;
1776     int ret;
1777
1778     rcu_read_lock();
1779
1780     /* This should be our last sync, the src is now paused */
1781     migration_bitmap_sync(rs);
1782
1783     /* Easiest way to make sure we don't resume in the middle of a host-page */
1784     rs->last_seen_block = NULL;
1785     rs->last_sent_block = NULL;
1786     rs->last_page = 0;
1787
1788     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1789         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1790         unsigned long *bitmap = block->bmap;
1791         unsigned long *unsentmap = block->unsentmap;
1792
1793         if (!unsentmap) {
1794             /* We don't have a safe way to resize the sentmap, so
1795              * if the bitmap was resized it will be NULL at this
1796              * point.
1797              */
1798             error_report("migration ram resized during precopy phase");
1799             rcu_read_unlock();
1800             return -EINVAL;
1801         }
1802         /* Deal with TPS != HPS and huge pages */
1803         ret = postcopy_chunk_hostpages(ms, block);
1804         if (ret) {
1805             rcu_read_unlock();
1806             return ret;
1807         }
1808
1809         /*
1810          * Update the unsentmap to be unsentmap = unsentmap | dirty
1811          */
1812         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1813 #ifdef DEBUG_POSTCOPY
1814         ram_debug_dump_bitmap(unsentmap, true, pages);
1815 #endif
1816     }
1817     trace_ram_postcopy_send_discard_bitmap();
1818
1819     ret = postcopy_each_ram_send_discard(ms);
1820     rcu_read_unlock();
1821
1822     return ret;
1823 }
1824
1825 /**
1826  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1827  *
1828  * Returns zero on success
1829  *
1830  * @rbname: name of the RAMBlock of the request. NULL means the
1831  *          same that last one.
1832  * @start: RAMBlock starting page
1833  * @length: RAMBlock size
1834  */
1835 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1836 {
1837     int ret = -1;
1838
1839     trace_ram_discard_range(rbname, start, length);
1840
1841     rcu_read_lock();
1842     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1843
1844     if (!rb) {
1845         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1846         goto err;
1847     }
1848
1849     ret = ram_block_discard_range(rb, start, length);
1850
1851 err:
1852     rcu_read_unlock();
1853
1854     return ret;
1855 }
1856
1857 static int ram_state_init(RAMState *rs)
1858 {
1859     memset(rs, 0, sizeof(*rs));
1860     qemu_mutex_init(&rs->bitmap_mutex);
1861     qemu_mutex_init(&rs->src_page_req_mutex);
1862     QSIMPLEQ_INIT(&rs->src_page_requests);
1863
1864     if (migrate_use_xbzrle()) {
1865         XBZRLE_cache_lock();
1866         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1867         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1868                                   TARGET_PAGE_SIZE,
1869                                   TARGET_PAGE_SIZE);
1870         if (!XBZRLE.cache) {
1871             XBZRLE_cache_unlock();
1872             error_report("Error creating cache");
1873             return -1;
1874         }
1875         XBZRLE_cache_unlock();
1876
1877         /* We prefer not to abort if there is no memory */
1878         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1879         if (!XBZRLE.encoded_buf) {
1880             error_report("Error allocating encoded_buf");
1881             return -1;
1882         }
1883
1884         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1885         if (!XBZRLE.current_buf) {
1886             error_report("Error allocating current_buf");
1887             g_free(XBZRLE.encoded_buf);
1888             XBZRLE.encoded_buf = NULL;
1889             return -1;
1890         }
1891     }
1892
1893     /* For memory_global_dirty_log_start below.  */
1894     qemu_mutex_lock_iothread();
1895
1896     qemu_mutex_lock_ramlist();
1897     rcu_read_lock();
1898     ram_state_reset(rs);
1899
1900     /* Skip setting bitmap if there is no RAM */
1901     if (ram_bytes_total()) {
1902         RAMBlock *block;
1903
1904         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1905             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1906
1907             block->bmap = bitmap_new(pages);
1908             bitmap_set(block->bmap, 0, pages);
1909             if (migrate_postcopy_ram()) {
1910                 block->unsentmap = bitmap_new(pages);
1911                 bitmap_set(block->unsentmap, 0, pages);
1912             }
1913         }
1914     }
1915
1916     /*
1917      * Count the total number of pages used by ram blocks not including any
1918      * gaps due to alignment or unplugs.
1919      */
1920     rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1921
1922     memory_global_dirty_log_start();
1923     migration_bitmap_sync(rs);
1924     qemu_mutex_unlock_ramlist();
1925     qemu_mutex_unlock_iothread();
1926     rcu_read_unlock();
1927
1928     return 0;
1929 }
1930
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
1937
1938 /**
1939  * ram_save_setup: Setup RAM for migration
1940  *
1941  * Returns zero to indicate success and negative for error
1942  *
1943  * @f: QEMUFile where to send the data
1944  * @opaque: RAMState pointer
1945  */
1946 static int ram_save_setup(QEMUFile *f, void *opaque)
1947 {
1948     RAMState *rs = opaque;
1949     RAMBlock *block;
1950
1951     /* migration has already setup the bitmap, reuse it. */
1952     if (!migration_in_colo_state()) {
1953         if (ram_state_init(rs) < 0) {
1954             return -1;
1955          }
1956     }
1957     rs->f = f;
1958
1959     rcu_read_lock();
1960
1961     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1962
1963     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1964         qemu_put_byte(f, strlen(block->idstr));
1965         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1966         qemu_put_be64(f, block->used_length);
1967         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1968             qemu_put_be64(f, block->page_size);
1969         }
1970     }
1971
1972     rcu_read_unlock();
1973
1974     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1975     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1976
1977     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1978
1979     return 0;
1980 }
1981
1982 /**
1983  * ram_save_iterate: iterative stage for migration
1984  *
1985  * Returns zero to indicate success and negative for error
1986  *
1987  * @f: QEMUFile where to send the data
1988  * @opaque: RAMState pointer
1989  */
1990 static int ram_save_iterate(QEMUFile *f, void *opaque)
1991 {
1992     RAMState *rs = opaque;
1993     int ret;
1994     int i;
1995     int64_t t0;
1996     int done = 0;
1997
1998     rcu_read_lock();
1999     if (ram_list.version != rs->last_version) {
2000         ram_state_reset(rs);
2001     }
2002
2003     /* Read version before ram_list.blocks */
2004     smp_rmb();
2005
2006     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2007
2008     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2009     i = 0;
2010     while ((ret = qemu_file_rate_limit(f)) == 0) {
2011         int pages;
2012
2013         pages = ram_find_and_save_block(rs, false);
2014         /* no more pages to sent */
2015         if (pages == 0) {
2016             done = 1;
2017             break;
2018         }
2019         rs->iterations++;
2020
2021         /* we want to check in the 1st loop, just in case it was the 1st time
2022            and we had to sync the dirty bitmap.
2023            qemu_get_clock_ns() is a bit expensive, so we only check each some
2024            iterations
2025         */
2026         if ((i & 63) == 0) {
2027             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2028             if (t1 > MAX_WAIT) {
2029                 trace_ram_save_iterate_big_wait(t1, i);
2030                 break;
2031             }
2032         }
2033         i++;
2034     }
2035     flush_compressed_data(rs);
2036     rcu_read_unlock();
2037
2038     /*
2039      * Must occur before EOS (or any QEMUFile operation)
2040      * because of RDMA protocol.
2041      */
2042     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2043
2044     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2045     rs->bytes_transferred += 8;
2046
2047     ret = qemu_file_get_error(f);
2048     if (ret < 0) {
2049         return ret;
2050     }
2051
2052     return done;
2053 }
2054
2055 /**
2056  * ram_save_complete: function called to send the remaining amount of ram
2057  *
2058  * Returns zero to indicate success
2059  *
2060  * Called with iothread lock
2061  *
2062  * @f: QEMUFile where to send the data
2063  * @opaque: RAMState pointer
2064  */
2065 static int ram_save_complete(QEMUFile *f, void *opaque)
2066 {
2067     RAMState *rs = opaque;
2068
2069     rcu_read_lock();
2070
2071     if (!migration_in_postcopy()) {
2072         migration_bitmap_sync(rs);
2073     }
2074
2075     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2076
2077     /* try transferring iterative blocks of memory */
2078
2079     /* flush all remaining blocks regardless of rate limiting */
2080     while (true) {
2081         int pages;
2082
2083         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2084         /* no more blocks to sent */
2085         if (pages == 0) {
2086             break;
2087         }
2088     }
2089
2090     flush_compressed_data(rs);
2091     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2092
2093     rcu_read_unlock();
2094
2095     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2096
2097     return 0;
2098 }
2099
2100 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2101                              uint64_t *non_postcopiable_pending,
2102                              uint64_t *postcopiable_pending)
2103 {
2104     RAMState *rs = opaque;
2105     uint64_t remaining_size;
2106
2107     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2108
2109     if (!migration_in_postcopy() &&
2110         remaining_size < max_size) {
2111         qemu_mutex_lock_iothread();
2112         rcu_read_lock();
2113         migration_bitmap_sync(rs);
2114         rcu_read_unlock();
2115         qemu_mutex_unlock_iothread();
2116         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2117     }
2118
2119     /* We can do postcopy, and all the data is postcopiable */
2120     *postcopiable_pending += remaining_size;
2121 }
2122
2123 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2124 {
2125     unsigned int xh_len;
2126     int xh_flags;
2127     uint8_t *loaded_data;
2128
2129     if (!xbzrle_decoded_buf) {
2130         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2131     }
2132     loaded_data = xbzrle_decoded_buf;
2133
2134     /* extract RLE header */
2135     xh_flags = qemu_get_byte(f);
2136     xh_len = qemu_get_be16(f);
2137
2138     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2139         error_report("Failed to load XBZRLE page - wrong compression!");
2140         return -1;
2141     }
2142
2143     if (xh_len > TARGET_PAGE_SIZE) {
2144         error_report("Failed to load XBZRLE page - len overflow!");
2145         return -1;
2146     }
2147     /* load data and decode */
2148     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2149
2150     /* decode RLE */
2151     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2152                              TARGET_PAGE_SIZE) == -1) {
2153         error_report("Failed to load XBZRLE page - decode error!");
2154         return -1;
2155     }
2156
2157     return 0;
2158 }
2159
2160 /**
2161  * ram_block_from_stream: read a RAMBlock id from the migration stream
2162  *
2163  * Must be called from within a rcu critical section.
2164  *
2165  * Returns a pointer from within the RCU-protected ram_list.
2166  *
2167  * @f: QEMUFile where to read the data from
2168  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2169  */
2170 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2171 {
2172     static RAMBlock *block = NULL;
2173     char id[256];
2174     uint8_t len;
2175
2176     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2177         if (!block) {
2178             error_report("Ack, bad migration stream!");
2179             return NULL;
2180         }
2181         return block;
2182     }
2183
2184     len = qemu_get_byte(f);
2185     qemu_get_buffer(f, (uint8_t *)id, len);
2186     id[len] = 0;
2187
2188     block = qemu_ram_block_by_name(id);
2189     if (!block) {
2190         error_report("Can't find block %s", id);
2191         return NULL;
2192     }
2193
2194     return block;
2195 }
2196
2197 static inline void *host_from_ram_block_offset(RAMBlock *block,
2198                                                ram_addr_t offset)
2199 {
2200     if (!offset_in_ramblock(block, offset)) {
2201         return NULL;
2202     }
2203
2204     return block->host + offset;
2205 }
2206
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with @ch.  When @ch is zero and the range
 * is already all zeroes, the memory is left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Skip the write if it would not change anything */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2223
2224 static void *do_data_decompress(void *opaque)
2225 {
2226     DecompressParam *param = opaque;
2227     unsigned long pagesize;
2228     uint8_t *des;
2229     int len;
2230
2231     qemu_mutex_lock(&param->mutex);
2232     while (!param->quit) {
2233         if (param->des) {
2234             des = param->des;
2235             len = param->len;
2236             param->des = 0;
2237             qemu_mutex_unlock(&param->mutex);
2238
2239             pagesize = TARGET_PAGE_SIZE;
2240             /* uncompress() will return failed in some case, especially
2241              * when the page is dirted when doing the compression, it's
2242              * not a problem because the dirty page will be retransferred
2243              * and uncompress() won't break the data in other pages.
2244              */
2245             uncompress((Bytef *)des, &pagesize,
2246                        (const Bytef *)param->compbuf, len);
2247
2248             qemu_mutex_lock(&decomp_done_lock);
2249             param->done = true;
2250             qemu_cond_signal(&decomp_done_cond);
2251             qemu_mutex_unlock(&decomp_done_lock);
2252
2253             qemu_mutex_lock(&param->mutex);
2254         } else {
2255             qemu_cond_wait(&param->cond, &param->mutex);
2256         }
2257     }
2258     qemu_mutex_unlock(&param->mutex);
2259
2260     return NULL;
2261 }
2262
2263 static void wait_for_decompress_done(void)
2264 {
2265     int idx, thread_count;
2266
2267     if (!migrate_use_compression()) {
2268         return;
2269     }
2270
2271     thread_count = migrate_decompress_threads();
2272     qemu_mutex_lock(&decomp_done_lock);
2273     for (idx = 0; idx < thread_count; idx++) {
2274         while (!decomp_param[idx].done) {
2275             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2276         }
2277     }
2278     qemu_mutex_unlock(&decomp_done_lock);
2279 }
2280
2281 void migrate_decompress_threads_create(void)
2282 {
2283     int i, thread_count;
2284
2285     thread_count = migrate_decompress_threads();
2286     decompress_threads = g_new0(QemuThread, thread_count);
2287     decomp_param = g_new0(DecompressParam, thread_count);
2288     qemu_mutex_init(&decomp_done_lock);
2289     qemu_cond_init(&decomp_done_cond);
2290     for (i = 0; i < thread_count; i++) {
2291         qemu_mutex_init(&decomp_param[i].mutex);
2292         qemu_cond_init(&decomp_param[i].cond);
2293         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2294         decomp_param[i].done = true;
2295         decomp_param[i].quit = false;
2296         qemu_thread_create(decompress_threads + i, "decompress",
2297                            do_data_decompress, decomp_param + i,
2298                            QEMU_THREAD_JOINABLE);
2299     }
2300 }
2301
2302 void migrate_decompress_threads_join(void)
2303 {
2304     int i, thread_count;
2305
2306     thread_count = migrate_decompress_threads();
2307     for (i = 0; i < thread_count; i++) {
2308         qemu_mutex_lock(&decomp_param[i].mutex);
2309         decomp_param[i].quit = true;
2310         qemu_cond_signal(&decomp_param[i].cond);
2311         qemu_mutex_unlock(&decomp_param[i].mutex);
2312     }
2313     for (i = 0; i < thread_count; i++) {
2314         qemu_thread_join(decompress_threads + i);
2315         qemu_mutex_destroy(&decomp_param[i].mutex);
2316         qemu_cond_destroy(&decomp_param[i].cond);
2317         g_free(decomp_param[i].compbuf);
2318     }
2319     g_free(decompress_threads);
2320     g_free(decomp_param);
2321     decompress_threads = NULL;
2322     decomp_param = NULL;
2323 }
2324
2325 static void decompress_data_with_multi_threads(QEMUFile *f,
2326                                                void *host, int len)
2327 {
2328     int idx, thread_count;
2329
2330     thread_count = migrate_decompress_threads();
2331     qemu_mutex_lock(&decomp_done_lock);
2332     while (true) {
2333         for (idx = 0; idx < thread_count; idx++) {
2334             if (decomp_param[idx].done) {
2335                 decomp_param[idx].done = false;
2336                 qemu_mutex_lock(&decomp_param[idx].mutex);
2337                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2338                 decomp_param[idx].des = host;
2339                 decomp_param[idx].len = len;
2340                 qemu_cond_signal(&decomp_param[idx].cond);
2341                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2342                 break;
2343             }
2344         }
2345         if (idx < thread_count) {
2346             break;
2347         } else {
2348             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2349         }
2350     }
2351     qemu_mutex_unlock(&decomp_done_lock);
2352 }
2353
2354 /**
2355  * ram_postcopy_incoming_init: allocate postcopy data structures
2356  *
2357  * Returns 0 for success and negative if there was one error
2358  *
2359  * @mis: current migration incoming state
2360  *
2361  * Allocate data structures etc needed by incoming migration with
2362  * postcopy-ram. postcopy-ram's similarly names
2363  * postcopy_ram_incoming_init does the work.
2364  */
2365 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2366 {
2367     unsigned long ram_pages = last_ram_page();
2368
2369     return postcopy_ram_incoming_init(mis, ram_pages);
2370 }
2371
2372 /**
2373  * ram_load_postcopy: load a page in postcopy case
2374  *
2375  * Returns 0 for success or -errno in case of error
2376  *
2377  * Called in postcopy mode by ram_load().
2378  * rcu_read_lock is taken prior to this being called.
2379  *
2380  * @f: QEMUFile where to send the data
2381  */
2382 static int ram_load_postcopy(QEMUFile *f)
2383 {
2384     int flags = 0, ret = 0;
2385     bool place_needed = false;
2386     bool matching_page_sizes = false;
2387     MigrationIncomingState *mis = migration_incoming_get_current();
2388     /* Temporary page that is later 'placed' */
2389     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2390     void *last_host = NULL;
2391     bool all_zero = false;
2392
2393     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2394         ram_addr_t addr;
2395         void *host = NULL;
2396         void *page_buffer = NULL;
2397         void *place_source = NULL;
2398         RAMBlock *block = NULL;
2399         uint8_t ch;
2400
2401         addr = qemu_get_be64(f);
2402         flags = addr & ~TARGET_PAGE_MASK;
2403         addr &= TARGET_PAGE_MASK;
2404
2405         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2406         place_needed = false;
2407         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2408             block = ram_block_from_stream(f, flags);
2409
2410             host = host_from_ram_block_offset(block, addr);
2411             if (!host) {
2412                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2413                 ret = -EINVAL;
2414                 break;
2415             }
2416             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2417             /*
2418              * Postcopy requires that we place whole host pages atomically;
2419              * these may be huge pages for RAMBlocks that are backed by
2420              * hugetlbfs.
2421              * To make it atomic, the data is read into a temporary page
2422              * that's moved into place later.
2423              * The migration protocol uses,  possibly smaller, target-pages
2424              * however the source ensures it always sends all the components
2425              * of a host page in order.
2426              */
2427             page_buffer = postcopy_host_page +
2428                           ((uintptr_t)host & (block->page_size - 1));
2429             /* If all TP are zero then we can optimise the place */
2430             if (!((uintptr_t)host & (block->page_size - 1))) {
2431                 all_zero = true;
2432             } else {
2433                 /* not the 1st TP within the HP */
2434                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2435                     error_report("Non-sequential target page %p/%p",
2436                                   host, last_host);
2437                     ret = -EINVAL;
2438                     break;
2439                 }
2440             }
2441
2442
2443             /*
2444              * If it's the last part of a host page then we place the host
2445              * page
2446              */
2447             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2448                                      (block->page_size - 1)) == 0;
2449             place_source = postcopy_host_page;
2450         }
2451         last_host = host;
2452
2453         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2454         case RAM_SAVE_FLAG_COMPRESS:
2455             ch = qemu_get_byte(f);
2456             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2457             if (ch) {
2458                 all_zero = false;
2459             }
2460             break;
2461
2462         case RAM_SAVE_FLAG_PAGE:
2463             all_zero = false;
2464             if (!place_needed || !matching_page_sizes) {
2465                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2466             } else {
2467                 /* Avoids the qemu_file copy during postcopy, which is
2468                  * going to do a copy later; can only do it when we
2469                  * do this read in one go (matching page sizes)
2470                  */
2471                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2472                                          TARGET_PAGE_SIZE);
2473             }
2474             break;
2475         case RAM_SAVE_FLAG_EOS:
2476             /* normal exit */
2477             break;
2478         default:
2479             error_report("Unknown combination of migration flags: %#x"
2480                          " (postcopy mode)", flags);
2481             ret = -EINVAL;
2482         }
2483
2484         if (place_needed) {
2485             /* This gets called at the last target page in the host page */
2486             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2487
2488             if (all_zero) {
2489                 ret = postcopy_place_page_zero(mis, place_dest,
2490                                                block->page_size);
2491             } else {
2492                 ret = postcopy_place_page(mis, place_dest,
2493                                           place_source, block->page_size);
2494             }
2495         }
2496         if (!ret) {
2497             ret = qemu_file_get_error(f);
2498         }
2499     }
2500
2501     return ret;
2502 }
2503
2504 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2505 {
2506     int flags = 0, ret = 0;
2507     static uint64_t seq_iter;
2508     int len = 0;
2509     /*
2510      * If system is running in postcopy mode, page inserts to host memory must
2511      * be atomic
2512      */
2513     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2514     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2515     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2516
2517     seq_iter++;
2518
2519     if (version_id != 4) {
2520         ret = -EINVAL;
2521     }
2522
2523     /* This RCU critical section can be very long running.
2524      * When RCU reclaims in the code start to become numerous,
2525      * it will be necessary to reduce the granularity of this
2526      * critical section.
2527      */
2528     rcu_read_lock();
2529
2530     if (postcopy_running) {
2531         ret = ram_load_postcopy(f);
2532     }
2533
2534     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2535         ram_addr_t addr, total_ram_bytes;
2536         void *host = NULL;
2537         uint8_t ch;
2538
2539         addr = qemu_get_be64(f);
2540         flags = addr & ~TARGET_PAGE_MASK;
2541         addr &= TARGET_PAGE_MASK;
2542
2543         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2544                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2545             RAMBlock *block = ram_block_from_stream(f, flags);
2546
2547             host = host_from_ram_block_offset(block, addr);
2548             if (!host) {
2549                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2550                 ret = -EINVAL;
2551                 break;
2552             }
2553         }
2554
2555         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2556         case RAM_SAVE_FLAG_MEM_SIZE:
2557             /* Synchronize RAM block list */
2558             total_ram_bytes = addr;
2559             while (!ret && total_ram_bytes) {
2560                 RAMBlock *block;
2561                 char id[256];
2562                 ram_addr_t length;
2563
2564                 len = qemu_get_byte(f);
2565                 qemu_get_buffer(f, (uint8_t *)id, len);
2566                 id[len] = 0;
2567                 length = qemu_get_be64(f);
2568
2569                 block = qemu_ram_block_by_name(id);
2570                 if (block) {
2571                     if (length != block->used_length) {
2572                         Error *local_err = NULL;
2573
2574                         ret = qemu_ram_resize(block, length,
2575                                               &local_err);
2576                         if (local_err) {
2577                             error_report_err(local_err);
2578                         }
2579                     }
2580                     /* For postcopy we need to check hugepage sizes match */
2581                     if (postcopy_advised &&
2582                         block->page_size != qemu_host_page_size) {
2583                         uint64_t remote_page_size = qemu_get_be64(f);
2584                         if (remote_page_size != block->page_size) {
2585                             error_report("Mismatched RAM page size %s "
2586                                          "(local) %zd != %" PRId64,
2587                                          id, block->page_size,
2588                                          remote_page_size);
2589                             ret = -EINVAL;
2590                         }
2591                     }
2592                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2593                                           block->idstr);
2594                 } else {
2595                     error_report("Unknown ramblock \"%s\", cannot "
2596                                  "accept migration", id);
2597                     ret = -EINVAL;
2598                 }
2599
2600                 total_ram_bytes -= length;
2601             }
2602             break;
2603
2604         case RAM_SAVE_FLAG_COMPRESS:
2605             ch = qemu_get_byte(f);
2606             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2607             break;
2608
2609         case RAM_SAVE_FLAG_PAGE:
2610             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2611             break;
2612
2613         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2614             len = qemu_get_be32(f);
2615             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2616                 error_report("Invalid compressed data length: %d", len);
2617                 ret = -EINVAL;
2618                 break;
2619             }
2620             decompress_data_with_multi_threads(f, host, len);
2621             break;
2622
2623         case RAM_SAVE_FLAG_XBZRLE:
2624             if (load_xbzrle(f, addr, host) < 0) {
2625                 error_report("Failed to decompress XBZRLE page at "
2626                              RAM_ADDR_FMT, addr);
2627                 ret = -EINVAL;
2628                 break;
2629             }
2630             break;
2631         case RAM_SAVE_FLAG_EOS:
2632             /* normal exit */
2633             break;
2634         default:
2635             if (flags & RAM_SAVE_FLAG_HOOK) {
2636                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2637             } else {
2638                 error_report("Unknown combination of migration flags: %#x",
2639                              flags);
2640                 ret = -EINVAL;
2641             }
2642         }
2643         if (!ret) {
2644             ret = qemu_file_get_error(f);
2645         }
2646     }
2647
2648     wait_for_decompress_done();
2649     rcu_read_unlock();
2650     trace_ram_load_complete(ret, seq_iter);
2651     return ret;
2652 }
2653
2654 static SaveVMHandlers savevm_ram_handlers = {
2655     .save_live_setup = ram_save_setup,
2656     .save_live_iterate = ram_save_iterate,
2657     .save_live_complete_postcopy = ram_save_complete,
2658     .save_live_complete_precopy = ram_save_complete,
2659     .save_live_pending = ram_save_pending,
2660     .load_state = ram_load,
2661     .cleanup = ram_migration_cleanup,
2662 };
2663
2664 void ram_mig_init(void)
2665 {
2666     qemu_mutex_init(&XBZRLE.lock);
2667     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2668 }